• This notebook summarizes the results from temporal covariance analysis from CVTKPY.
  • The covariance window size used was 100k (but the window sizes did not affect the results)

1 Results from CVTKPY: genome-wide temporal covariances of allele frequencies

# Populations analysed; one genome-wide covariance matrix file is read per population.
pops <- c("PWS", "TB", "SS")

covs <- data.frame()
for (p in seq_along(pops)) {
    # Covariance output file for this population (100k windows); the first column is dropped
    cov <- read.csv(paste0("~/Projects/Pacherring_Vincent/MD7000/3Pops_maf05_temp_cov_matrix_", pops[p], "_100k.csv"))
    cov <- cov[, -1]

    # Rows 1-3 are used as the values, rows 4-6 as the matching labels
    mat1 <- cov[1:3, ]
    mat2 <- cov[4:6, ]

    # Walk both tables row by row and store each cell as one (label, value) pair
    covdf <- data.frame()
    cell <- 0
    for (i in seq_len(nrow(mat1))) {
        for (j in seq_len(ncol(mat1))) {
            cell <- cell + 1
            covdf[cell, 1] <- mat2[i, j]
            covdf[cell, 2] <- mat1[i, j]
        }
    }
    colnames(covdf) <- c("label", "value")
    covdf$value <- as.numeric(covdf$value)

    # Keep only the cells whose label contains "cov"
    covar <- covdf[grep("cov", covdf$label), ]

    # Remove the redundant values: fixed rows for SS, first occurrence of each value otherwise
    if (pops[p] == "SS") {
        covar <- covar[c(1, 2, 4), ]
    } else {
        covar <- covar[!duplicated(covar[, 2]), ]
    }

    # Starting time period and covering period of the three retained covariances
    covar$year <- c(1, 2, 2)
    covar$series <- c("1991", "1991", "1996")

    # Population name
    covar$location <- pops[p]

    # Append to the combined table
    covs <- rbind(covs, covar)
}

covs$time <- rep(c("cov12", "cov13", "cov23"), 3)
colnames(covs)[2] <- "cov"

# 95% confidence intervals (calculated from the 'straps' returned from
# bootstrap_cov2(): ci = 1.96 * sd(straps)), read from one CSV per population
# and attached to the matching rows of `covs` as column `ci`.
time <- c("cov12", "cov13", "cov23")

covs$ci <- NA
for (i in seq_along(pops)) {
    df <- read.csv(paste0("~/Projects/Pacherring_Vincent/MD7000/", pops[i], "_CIs_100kwindow.csv"), header = FALSE)
    in_pop <- covs$location == pops[i]

    if (pops[i] == "SS") {
        # SS: only the cov23 interval is filled, taken from cell [1, 2]
        covs$ci[in_pop & covs$time == "cov23"] <- df[1, 2]
    } else {
        # PWS / TB: three intervals taken from cells [1, 2], [1, 3] and [2, 3]
        covs$ci[in_pop & covs$time == "cov12"] <- df[1, 2]
        covs$ci[in_pop & covs$time == "cov13"] <- df[1, 3]
        covs$ci[in_pop & covs$time == "cov23"] <- df[2, 3]
    }
}

# Row names are written as the first column (write.csv default)
write.csv(covs, "../Output/COV/GW_covariance_CIs.csv")

# Tick labels for x = 1 and x = 2
xtexts <- c("\u03941991-1996\n ~ \u03941996-2006", "\n  ~ \u03942006-2017")

# One shared horizontal offset so points, lines and error bars stay aligned
dodge_pos <- position_dodge(width = 0.1, preserve = "total")

# Covariance against period, one line per location/series combination, with CI bars
ggplot(
    data = covs,
    aes(x = year, y = cov, color = location, shape = series, group = interaction(location, series))
) +
    geom_point(size = 3, position = dodge_pos) +
    geom_line(
        data = covs,
        aes(x = year, y = cov, color = location, group = interaction(location, series)),
        position = dodge_pos
    ) +
    ylab("Covariance") +
    xlab('') +
    theme_classic() +
    theme(legend.title = element_blank()) +
    geom_hline(yintercept = 0, color = "gray70", size = 0.3) +
    geom_errorbar(aes(ymin = cov - ci, ymax = cov + ci), width = .2, size = .2, position = dodge_pos) +
    scale_shape_manual(values = c(16, 17), labels = c("\u0394'91-'96~", "\u0394'96-'06~")) +
    scale_x_continuous(breaks = c(1, 2), labels = xtexts) +
    scale_color_manual(values = cols[c(2, 3, 1)]) +
    ylim(-0.0023, 0.002)
ggsave("../Output/COV/3Pops_Cov_overtime_CIestimated.png", width = 4.7, height = 3, dpi = 300)
    
# Order the periods on the x axis as cov12, cov23, cov13
covs$time <- factor(covs$time, levels = c("cov12", "cov23", "cov13"))
#xtexts<-c("\u03941991-1996\n ~ \u03941996-2006", "\u03941996-2006\n  ~ \u03942006-2017", "\u03941991-1996\n  ~ \u03942006-2017")
xtexts <- c(
    "\u0394'91-'96\n ~ \u0394'96-'06",
    "\u0394'96-'06\n  ~ \u0394'06-'17",
    "\u0394'91-'96\n  ~ \u0394'06-'17"
)

# Same offset for points and error bars
dodge_pos <- position_dodge(width = 0.1, preserve = "total")

# One point (with CI bar) per location and period; vertical rules separate the periods
ggplot(data = covs, aes(x = time, y = cov, color = location)) +
    geom_point(size = 3, position = dodge_pos) +
    #geom_line(data=covs, aes(x=year, y=cov,color=location, group=interaction(location, series)), position=position_dodge(width = 0.1,preserve ="total"))+
    ylab("Covariance") +
    xlab('') +
    theme_classic() +
    theme(legend.title = element_blank(), axis.text.x = element_text(size = 9)) +
    geom_hline(yintercept = 0, color = "gray70", size = 0.3) +
    geom_errorbar(aes(ymin = cov - ci, ymax = cov + ci), width = .2, size = .2, position = dodge_pos) +
    scale_x_discrete(labels = xtexts) +
    scale_color_manual(values = cols[c(2, 3, 1)]) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", size = 0.2) +
    ylim(-0.0023, 0.002)
ggsave("../Output/COV/3Pops_Cov_CI_3timepoints.png", width = 4.57, height = 3, dpi = 300)

2 Find regions with high covariances in each population

  • From Temporal Covariance analysis -output covariances for each time period

2.1 Plot the covariances across the genome

# Find the regions with a high temporal covariance.
# For every population this reads the per-window covariances, stores them in
# `cov.list` under "<pop>_<winsize>", and draws them across the genome.
pops <- c("PWS", "TB", "SS")
winsize <- "100k"
evens <- paste0("chr", seq(2, 26, by = 2))
cov.list <- list()
covs_all <- list()

md_dir <- "~/Projects/Pacherring_Vincent/MD7000/"
# Middle part of each per-window covariance file name, keyed by the column it fills
cov_files <- c(
    cov12 = "_cov12_1996-1991_2006-1996_3Pops_",
    cov23 = "_cov23_2017-2006_2006-1996_3Pops_",
    cov13 = "_cov13_2017-2006_1996-1991_3Pops_"
)

# The window table does not depend on the population, so it is read once
iv <- read.csv(paste0(md_dir, "3pops_intervals_", winsize, "window.csv"), row.names = 1)

for (p in seq_along(pops)) {
    pop <- pops[p]
    # Only the cov23 file is read for SS; the other populations have all three
    periods <- if (pop == "SS") "cov23" else c("cov12", "cov23", "cov13")

    per_window <- lapply(periods, function(period) {
        read.csv(paste0(md_dir, pop, cov_files[[period]], winsize, "window.csv"), header = FALSE)
    })
    covs <- do.call(cbind, c(list(iv), per_window))
    # The covariance columns follow the three columns of `iv`
    colnames(covs)[3 + seq_along(periods)] <- periods
    covs$index <- seq_len(nrow(covs))

    # Alternate colours between odd and even chromosomes
    covs$color <- "col1"
    covs$color[covs$chrom %in% evens] <- "col2"

    # Treat infinite and NaN entries as missing
    covs[vapply(covs, is.infinite, logical(nrow(covs)))] <- NA
    covs[vapply(covs, is.nan, logical(nrow(covs)))] <- NA

    cov.list[[paste0(pop, "_", winsize)]] <- covs

    if (length(periods) == 1) {
        # Lower axis limit is the data minimum, but never below -0.1
        ymin <- max(min(covs$cov23, na.rm = TRUE), -0.1)
        ymax <- max(covs$cov23, na.rm = TRUE)
        genome_plot <- ggplot(covs, aes(x = index, y = cov23, color = color)) +
            geom_point(size = 1, alpha = 0.5) +
            theme_classic() +
            ylim(ymin, ymax) +
            scale_color_manual(values = c("gray70", "steelblue"), guide = "none") +
            ylab("Covariance") + xlab('Chromosome') +
            theme(axis.text.x = element_blank()) +
            ggtitle(paste0(pop, " ", winsize, " window"))
        #ggsave(paste0("../Output/COV/3Pops.",pop,"_tempCovs_acrossGenome_",winsize, "Window.png"), width = 8, height = 2.7, dpi=300)
    } else {
        # Long format so each period gets its own facet
        covsm <- melt(covs[, c("index", "color", "cov12", "cov23", "cov13")], id.vars = c("index", "color"))
        ymax <- max(covsm$value, na.rm = TRUE)
        ymin <- max(min(covsm$value, na.rm = TRUE), -0.1)
        genome_plot <- ggplot(covsm, aes(x = index, y = value, color = color)) +
            facet_wrap(~variable, nrow = 3) +
            geom_point(size = 1, alpha = 0.5) +
            theme_classic() +
            ylim(ymin, ymax) +
            scale_color_manual(values = c("gray70", "steelblue"), guide = "none") +
            ylab("Covariance") + xlab('Chromosome') +
            theme(axis.text.x = element_blank()) +
            ggtitle(paste0(pop, " ", winsize, " window"))
        #ggsave(paste0("../Output/COV/3Pops.",pop,"_tempCovs_acrossGenome_",winsize, "Window.png"), width = 8, height = 8, dpi=300)
    }
    # Plots are not auto-printed inside a loop, so print explicitly
    print(genome_plot)
}

3 Find the covariance lower cut off values

# For every population/period combination, record the range ("min-max") of the
# covariances in the top 1% of windows. Result: one row per "<pop>_<period>"
# label in `cvrange`, with the range stored as text in column "100k".
cv <- c("cov12", "cov13", "cov23")
cvrange <- data.frame(pop = c(paste0(pops[1:2], "_", cv[1]), paste0(pops[1:2], "_", cv[2]), paste0(pops, "_", cv[3])))
cvrange[["100k"]] <- NA_character_

for (i in seq_along(cv)) {
    # cov12 and cov13 are evaluated for the first two populations, cov23 for all three
    pops_i <- if (cv[i] == "cov23") pops else pops[1:2]

    for (pop_i in pops_i) {
        df <- cov.list[[paste0(pop_i, "_100k")]]
        n <- ceiling(nrow(df) * 0.01) # number of windows in the top 1%

        # sort() drops missing values, so the top n are the n largest observed covariances
        top_vals <- head(sort(df[[cv[i]]], decreasing = TRUE), n)
        rg <- range(top_vals)

        cvrange[cvrange$pop == paste0(pop_i, "_", cv[i]), "100k"] <- paste0(rg[1], "-", rg[2])
    }
}

# Long table of the top-1% ranges: split the "low-high" text into two columns
# and the "<pop>_<period>" label into `pop` and `cov`.
cvs <- melt(cvrange, id.vars = "pop") %>%
    separate(value, c("low", "high"), "-") %>%
    separate(pop, c("pop", "cov"), "_")
cvs$low <- as.numeric(cvs$low)
cvs$high <- as.numeric(cvs$high)

# Pieces shared by both figures
period_rules <- geom_vline(xintercept = c(1.5, 2.5), color = "gray")
plain_panel <- theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.title = element_blank()
)

# Full range of the top 1% per population and period
ggplot(cvs, aes(x = cov, y = high, fill = pop)) +
    geom_crossbar(aes(ymin = low, ymax = high), width = 0.5, position = position_dodge(width = 1)) +
    ylab("Range of covariances") +
    theme_light() +
    xlab("") +
    period_rules +
    plain_panel +
    ggtitle("Top1% Cov Range")
ggsave("../Output/COV/COVscan_3pop/TempCov_Range_comparison_100k.png", width = 5, height = 3, dpi = 300)

# Lower limit of the top 1% only
ggplot(cvs, aes(x = cov, y = low, color = pop)) +
    geom_point() +
    ylab("Lower limit of top 1% covariance") +
    theme_light() +
    xlab("") +
    period_rules +
    plain_panel
ggsave("../Output/COV/COVscan_3pop/TempCov_Range_lowLimit_comparison_100k.png", width = 5, height = 3, dpi = 300)

3.1 Use the lowest covariance values for each period to define outlier regions

# Per period, the smallest "low" value across populations is used as the cutoff.
lows <- aggregate(cvs$low, by = list(cvs$cov), min)
names(lows) <- c("cov", "low")
#low cutoff for each time period (100k-window)
#     cov        low
#1 cov12 0.02874841
#2 cov13 0.03102712
#3 cov23 0.03246524


# Windows of `covs` whose `period` covariance is at or above that period's
# cutoff in `lows`, restricted to columns `keep`, sorted by chromosome and
# start, and tagged with the population name. Rows with a missing covariance
# are dropped.
outlier_windows <- function(covs, period, pop, lows, keep = colnames(covs)) {
    cutoff <- lows$low[lows$cov == period]
    hits <- covs[which(covs[[period]] >= cutoff), keep, drop = FALSE]
    hits <- hits[order(hits$chrom, hits$start), ]
    hits$pop <- pop
    hits
}

# Outliers based on the new low cut-off values 100k window.
cov12 <- data.frame()
cov23 <- data.frame()
cov13 <- data.frame()

# Columns kept in the cov23 table so that all populations share one layout
cov23_cols <- c("chrom", "start", "end", "cov23", "index", "color")

for (i in seq_along(cov.list)) {
    covs <- cov.list[[i]]
    pop <- gsub("_.+", '', names(cov.list)[i])

    # Only tables that carry the earlier periods contribute to cov12 / cov13
    if (all(c("cov12", "cov13") %in% colnames(covs))) {
        cov12 <- rbind(cov12, outlier_windows(covs, "cov12", pop, lows))
        cov13 <- rbind(cov13, outlier_windows(covs, "cov13", pop, lows))
    }
    cov23 <- rbind(cov23, outlier_windows(covs, "cov23", pop, lows, keep = cov23_cols))
}

write.csv(cov12, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov12.csv", row.names = FALSE)
write.csv(cov23, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov23.csv", row.names = FALSE)
write.csv(cov13, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov13.csv", row.names = FALSE)

3.1.1 Stricter covariance cutoff

# Rebuild the outlier tables and, for populations with a cov12 column, write a
# BED file of the cov12 outlier windows that also exceed a stricter cutoff.
cov12 <- data.frame()
cov23 <- data.frame()
cov13 <- data.frame()
names(cov.list)

strict_cutoff <- 0.035 # stricter cov12 threshold
bed_pad <- 100000      # bases added on each side of a window

for (i in seq_along(cov.list)) {
    covs <- cov.list[[i]]
    pop <- gsub("_.+", '', names(cov.list)[i])

    # Tables with cov12 / cov13 columns get all three periods; the rest only cov23
    if (all(c("cov12", "cov13") %in% colnames(covs))) {
        plot(covs$cov12)

        # outlier cutoff value
        x <- lows$low[lows$cov == "cov12"]
        covs12_top <- subset(covs, cov12 >= x)
        # cov cutoff at 0.035
        c12 <- covs12_top[covs12_top$cov12 > strict_cutoff, ]

        # Create a BED file for the regions of this population. `c12` already
        # holds a single population, so no filtering on a `pop` column is needed.
        bed <- c12[, c("chrom", "start", "end")]
        bed$start <- pmax(bed$start - bed_pad, 0) # BED starts cannot be negative
        bed$end <- bed$end + bed_pad
        colnames(bed) <- c('track type=bedGraph', '1', '1')
        # NOTE(review): the "_new.bed" suffix is also used by the BED export in
        # section 4.1 (looser cutoff), so whichever chunk runs last wins — confirm
        # whether the stricter set should get its own file name.
        write.table(bed, paste0("../Output/COV/COVscan_3pop/cutoff/", pop, "_outliers_cov12_new.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")

        covs12_top <- covs12_top[order(covs12_top$chrom, covs12_top$start), ]
        covs12_top$pop <- pop
        cov12 <- rbind(cov12, covs12_top)

        x <- lows$low[lows$cov == "cov13"]
        covs13_top <- subset(covs, cov13 >= x)
        covs13_top <- covs13_top[order(covs13_top$chrom, covs13_top$start), ]
        covs13_top$pop <- pop
        cov13 <- rbind(cov13, covs13_top)
    }

    # cov23 is available for every population; keep one common column layout
    x <- lows$low[lows$cov == "cov23"]
    covs23_top <- subset(covs[, c("chrom", "start", "end", "cov23", "index", "color")], cov23 >= x)
    covs23_top <- covs23_top[order(covs23_top$chrom, covs23_top$start), ]
    covs23_top$pop <- pop
    cov23 <- rbind(cov23, covs23_top)
}

3.2 Create plots with different colors for outliers

# Highlight outlier windows (covariance >= the period cutoff in `lows`).
# COV12 and COV13 are drawn for PWS and TB, COV23 for PWS, TB and SS (100K).
# Each period gets a per-chromosome facet plot and a whole-genome plot.
cv <- c("cov12", "cov13", "cov23")

# First, last and middle window position of chr1..chr26 when the windows of
# `df` are laid end to end in chromosome order; `x` is used for axis breaks.
chrom_axis_positions <- function(df) {
    chroms <- paste0("chr", 1:26)
    n_win <- vapply(chroms, function(ch) sum(df$chrom == ch), numeric(1), USE.NAMES = FALSE)
    last <- cumsum(n_win)
    first <- last - n_win + 1
    data.frame(chr = chroms, start = first, end = last, x = first + (last - first) / 2)
}

for (i in seq_along(cv)) {
    # cutoff value
    x <- lows$low[lows$cov == cv[i]]

    if (i == 1 | i == 2) {
        # PWS: sort so the highest covariances come first, then flag the outliers
        df1 <- cov.list[["PWS_100k"]]
        df1 <- df1[order(df1[, cv[i]], decreasing = TRUE), ]
        df1$top1 <- "N"
        df1$top1[df1[, cv[i]] >= x] <- "PWS"

        # TB
        df2 <- cov.list[["TB_100k"]]
        df2 <- df2[order(df2[, cv[i]], decreasing = TRUE), ]
        df2$top1 <- "N"
        df2$top1[df2[, cv[i]] >= x] <- "TB"

        # Combine PWS and TB tables; the period column is renamed to "cov"
        co <- rbind(df1, df2)
        co$chrom <- factor(co$chrom, levels = paste0("chr", 1:26))
        co$top1 <- factor(co$top1, levels = c("PWS", "TB", "N"))
        colnames(co)[which(colnames(co) == cv[i])] <- "cov"

        ymax <- max(co$cov, na.rm = TRUE)
        # Plot each chromosome separately
        ggplot(co, aes(x = start / 1000000, y = cov, color = top1)) +
            geom_point(size = 0.5) +
            facet_wrap(~chrom, ncol = 4) +
            theme_classic() + ylim(-0.1, ymax) +
            scale_color_manual(values = c(paste0(cols[2], "B3"), paste0(cols[1], "B3"), "#C0C0C080"), labels = c("PWS", "TB", "")) +
            ylab("Covariance") + xlab('Postion (Mb)') +
            ggtitle(cv[i]) +
            scale_x_continuous(labels = comma) +
            guides(color = guide_legend(override.aes = list(color = c(cols[2], cols[1], "white"), size = 2), title = element_text("Top 1%")))

        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.", cv[i], "_perChrom_100k_Window_Outliers.png"), width = 10, height = 8, dpi = 300)

        # Whole genome in 1 plot
        # Non-outliers take the alternating chromosome colour instead of "N"
        co$top1 <- ifelse(co$top1 == "N", co$color, as.character(co$top1))
        co$top1 <- factor(co$top1, levels = c("PWS", "TB", "col1", "col2"))

        # Axis breaks at the middle of each chromosome (window counts from PWS)
        poss <- chrom_axis_positions(df1)
        ggplot(co, aes(x = index, y = cov, color = top1)) +
            geom_point(size = 0.5) +
            theme_classic() + ylim(-0.1, ymax) +
            scale_color_manual(values = c(paste0(cols[2], "B3"), paste0(cols[1], "B3"), "#A8BBCD66", "#D6D6D666"), labels = c("PWS", "TB", "", "")) +
            ylab("Covariance") +
            ggtitle(paste0(" 100k window ", cv[i])) +
            guides(color = guide_legend(override.aes = list(color = c(cols[2], cols[1], "white", "white"), size = 2), title = element_text("Outlier Region", size = 10))) +
            scale_x_continuous(name = "Chromosome", breaks = poss$x, labels = 1:26)
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.", cv[i], "_100k_Window_Outliers.png"), width = 10, height = 3.5, dpi = 300)
    }

    if (i == 3) {
        cov23_cols <- c("chrom", "start", "end", "cov23", "index", "color")

        # PWS
        df1 <- cov.list[["PWS_100k"]]
        df1 <- df1[, cov23_cols]
        df1 <- df1[order(df1$cov23, decreasing = TRUE), ]
        df1$top1 <- "N"
        df1$top1[df1[, cv[i]] >= x] <- "PWS"

        # TB
        df2 <- cov.list[["TB_100k"]]
        df2 <- df2[, cov23_cols]
        df2 <- df2[order(df2$cov23, decreasing = TRUE), ]
        df2$top1 <- "N"
        df2$top1[df2[, cv[i]] >= x] <- "TB"

        # SS
        df3 <- cov.list[["SS_100k"]]
        df3 <- df3[, cov23_cols]
        df3 <- df3[order(df3$cov23, decreasing = TRUE), ]
        df3$top1 <- "N"
        df3$top1[df3[, cv[i]] >= x] <- "SS"

        co <- rbind(df1, df2, df3)

        co$chrom <- factor(co$chrom, levels = paste0("chr", 1:26))
        co$top1 <- factor(co$top1, levels = c("PWS", "TB", "SS", "N"))
        ymax <- max(co$cov23, na.rm = TRUE)
        ggplot(co, aes(x = start / 1000000, y = cov23, color = top1)) +
            geom_point(size = 0.6) +
            facet_wrap(~chrom, ncol = 4) +
            theme_classic() + ylim(-0.1, ymax) +
            ylab("Covariance") + xlab('Postion (Mb)') +
            ggtitle(cv[i]) +
            scale_x_continuous(labels = comma) +
            #scale_color_discrete(breaks=c("PWS","SS","TB"))+
            scale_color_manual(values = c(paste0(cols[c(2, 1, 3)], "B3"), "#C0C0C088"), labels = c("PWS", "TB", "SS", "")) +
            guides(color = guide_legend(override.aes = list(color = c(cols[c(2, 1, 3)], "white"), size = 2), title = element_text("Top 1% outliers")))
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.cov23_perChrom_100k_Window_Outliers.png"), width = 10, height = 9, dpi = 300)

        # Non-outliers take the alternating chromosome colour instead of "N"
        co$top1 <- ifelse(co$top1 == "N", co$color, as.character(co$top1))
        co$top1 <- factor(co$top1, levels = c("PWS", "TB", "SS", "col1", "col2"))

        # Axis breaks at the middle of each chromosome (window counts from PWS)
        poss <- chrom_axis_positions(df1)
        # Use the full column name: `co$cov` only worked through partial matching
        ymax <- max(co$cov23, na.rm = TRUE)
        ggplot(co, aes(x = index, y = cov23, color = top1)) +
            geom_point(size = 0.5) +
            theme_classic() + ylim(-0.1, ymax) +
            scale_color_manual(values = c(paste0(cols[c(2, 1, 3)], "B3"), "#A8BBCD66", "#D6D6D666"), labels = c("PWS", "TB", "SS", "", "")) +
            ylab("Covariance") +
            ggtitle(paste0(" 100k window ", cv[i])) +
            guides(color = guide_legend(override.aes = list(color = c(cols[c(2, 1, 3)], "white", "white"), size = 2), title = element_text("Outlier (1%)"))) +
            scale_x_continuous(name = "Chromosome", breaks = poss$x, labels = 1:26) +
            theme(legend.title = element_text(size = 10))
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.", cv[i], "_100k_Window_Outliers.png"), width = 10, height = 3.5, dpi = 300)
    }
}

3.2.1 Whole-genome plots of all time periods for PWS and TB

## Plot 3 time periods together for PWS and TB
# `Cov` stacks, for every period, the PWS and TB windows with their covariance
# (column `cov`), colour class (`top1`) and period label (`time`).
Cov <- data.frame()
for (i in seq_along(cv)) {
    # cutoff value
    x <- lows$low[lows$cov == cv[i]]

    # PWS
    df1 <- cov.list[["PWS_100k"]]
    df1 <- df1[order(df1[, cv[i]], decreasing = TRUE), ]
    df1$top1 <- "N"
    df1$top1[df1[, cv[i]] >= x] <- "PWS"

    # TB
    df2 <- cov.list[["TB_100k"]]
    df2 <- df2[order(df2[, cv[i]], decreasing = TRUE), ]
    df2$top1 <- "N"
    df2$top1[df2[, cv[i]] >= x] <- "TB"

    # Combine PWS and TB tables; the period column is renamed to "cov"
    co <- rbind(df1, df2)
    co$chrom <- factor(co$chrom, levels = paste0("chr", 1:26))
    colnames(co)[which(colnames(co) == cv[i])] <- "cov"

    # Assign colors: non-outliers take the alternating chromosome colour
    co$top1 <- ifelse(co$top1 == "N", co$color, co$top1)
    co$top1 <- factor(co$top1, levels = c("PWS", "TB", "col1", "col2"))
    co$time <- cv[i]

    Cov <- rbind(Cov, co[, c("index", "cov", "top1", "time")])
}

# Count the number of windows per chromosome (from PWS) to place the axis breaks
df1 <- cov.list[["PWS_100k"]]
chroms <- paste0("chr", 1:26)
n_win <- vapply(chroms, function(ch) sum(df1$chrom == ch), numeric(1), USE.NAMES = FALSE)
poss <- data.frame(chr = chroms)
poss$start <- cumsum(n_win) - n_win + 1
poss$end <- cumsum(n_win)
poss$x <- poss$start + (poss$end - poss$start) / 2

# Upper limit taken over all three periods so that no panel is clipped
ymax <- max(Cov$cov, na.rm = TRUE)
ggplot(Cov, aes(x = index, y = cov, color = top1)) +
    facet_wrap(~time, ncol = 1) +
    geom_point(size = 0.5) +
    theme_classic() + ylim(-0.1, ymax) +
    scale_color_manual(values = c(paste0(cols[c(2, 1)], "B3"), "#A8BBCD66", "#D6D6D666"), labels = c("PWS", "TB", "", "")) +
    ylab("Covariance") +
    guides(color = guide_legend(override.aes = list(color = c(cols[c(2, 1)], "white", "white"), size = 2), title = element_text("Outlier", size = 10))) +
    scale_x_continuous(name = "Chromosome", breaks = poss$x, labels = 1:26)

ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/PWS_TB_100k_Window_Outliers.png"), width = 11, height = 5, dpi = 300)

3.3 Overlapping outlier regions between different populations

#100k
# Count outlier windows shared between populations: exact matches (same
# chrom/start) in `Ov_direct`, and windows whose starts lie within 200,000
# bases of each other in `Ov_300`. The near matches are also written per pair.
cv <- c("cov12", "cov13", "cov23")
pairs <- t(combn(pops, 2))
pairs <- data.frame(pairs)
colnames(pairs) <- paste0("pop", 1:2)
Ov_direct <- data.frame(cov = c(cv[1:2], "cov23-PT", "cov23-PS", "cov23-ST", "cov23-3"))
Ov_300 <- data.frame(cov = c(cv[1:2], "cov23-PT", "cov23-PS", "cov23-ST", "cov23-3"))
Ov_direct$count <- NA_integer_
Ov_300$count <- NA_integer_

cutoff_dir <- "../Output/COV/COVscan_3pop/cutoff/"

# Pairs of outlier windows from `pop_a` and `pop_b` on the same chromosome whose
# starts differ by at most `pad` bases. Returns one row per pair: the id of the
# `pop_b` window, the merged start/end, and the `cov_col` value of each
# population in columns "cov.<pop_a>" and "cov.<pop_b>". Exact duplicate rows
# are removed; with no matches a zero-row table is returned.
overlap_table <- function(df, pop_a, pop_b, cov_col, pad = 200000) {
    a <- df[df$pop == pop_a, ]
    b <- df[df$pop == pop_b, ]

    hit_a <- integer(0)
    hit_b <- integer(0)
    for (n in seq_len(nrow(a))) {
        s <- which(b$chrom == a$chrom[n] & abs(b$start - a$start[n]) <= pad)
        hit_a <- c(hit_a, rep(n, length(s)))
        hit_b <- c(hit_b, s)
    }
    a <- a[hit_a, ]
    b <- b[hit_b, ]

    # The merged region starts at the earlier window and ends at the later one
    b_first <- b$start < a$start
    ov <- data.frame(
        id = b$id,
        start = ifelse(b_first, b$start, a$start),
        end = ifelse(b_first, a$end, b$end)
    )
    ov[[paste0("cov.", pop_a)]] <- a[[cov_col]]
    ov[[paste0("cov.", pop_b)]] <- b[[cov_col]]
    ov[!duplicated(ov), ]
}

for (i in seq_along(cv)) {
    df <- read.csv(paste0(cutoff_dir, "3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    df$id <- paste0(df$chrom, "_", df$start)

    ids_pws <- df$id[df$pop == "PWS"]
    ids_tb <- df$id[df$pop == "TB"]

    if (i != 3) {
        # exact overlaps
        Ov_direct$count[i] <- length(intersect(ids_pws, ids_tb))

        #### Check chromosome region overlap +-200,000 bases
        ov <- overlap_table(df, "PWS", "TB", cv[i])
        # NOTE(review): the file name says "plusminus100k" although the window
        # used is 200,000 bases; kept unchanged in case other code reads it.
        write.csv(ov, paste0(cutoff_dir, "Overlap_regions_", cv[i], "_plusminus100k.csv"), row.names = FALSE)
        Ov_300$count[i] <- nrow(ov)
    }

    if (i == 3) {
        ids_ss <- df$id[df$pop == "SS"]
        Ov_direct$count[i] <- length(intersect(ids_pws, ids_tb))
        Ov_direct$count[i + 1] <- length(intersect(ids_pws, ids_ss))
        Ov_direct$count[i + 2] <- length(intersect(ids_ss, ids_tb))
        Ov_direct$count[i + 3] <- length(intersect(ids_ss, intersect(ids_pws, ids_tb)))

        for (j in seq_len(nrow(pairs))) {
            #### Check chromosome region overlap +-200,000 bases
            pop_a <- as.character(pairs[j, 1])
            pop_b <- as.character(pairs[j, 2])
            ov <- overlap_table(df, pop_a, pop_b, cv[i])
            write.csv(ov, paste0(cutoff_dir, "Overlap_regions_", cv[i], "_", pop_a, ".", pop_b, "_plusminus200k.csv"), row.names = FALSE)
            Ov_300$count[i + j - 1] <- nrow(ov)
        }
    }
}
write.csv(Ov_direct, paste0("../Output/COV/COVscan_3pop/cutoff/Direct_Overlapping_regions_counts_3pop_summary.csv"))
# No three-way count is computed for the padded comparison
Ov_300$count[6] <- NA
write.csv(Ov_300, paste0("../Output/COV/COVscan_3pop/cutoff/Overlapping_regions_counts_3pop_plusMinus200k.csv"))

4 Run the snpEff pipeline to find annotation in the outlier regions (100k-window+-100k)

4.1 Create a script to run SnpEff

Create VCF files with selected regions & run snpEff

#Create bed files
# One BED file per population and period from the outlier tables, each window
# padded by 100k on both sides, followed by a bash script that runs vcftools
# on every BED file.
cv <- c("cov12", "cov13", "cov23")
#Prevent scientific notation in bed files
options(scipen = 999)

cutoff_dir <- "../Output/COV/COVscan_3pop/cutoff/"

# vcftools treats the first line of a BED file as a header, so every file is
# written with a header line; otherwise the first region would be skipped.
for (i in 1:3) {
    df <- read.csv(paste0(cutoff_dir, "3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    #add 100k (start is clamped at 0 because BED coordinates cannot be negative)
    df$start <- pmax(df$start - 100000, 0)
    df$end <- df$end + 100000

    # SS is only exported for the third period
    bed_pops <- if (i == 3) c("PWS", "TB", "SS") else c("PWS", "TB")
    for (bed_pop in bed_pops) {
        bed <- df[df$pop == bed_pop, 1:3]
        colnames(bed) <- c('track type=bedGraph', '1', '1')
        write.table(bed, paste0(cutoff_dir, bed_pop, "_outliers_", cv[i], "_new.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
    }
}

# Create a bash script to create vcf files with selected regions
bedfiles <- list.files(cutoff_dir, pattern = "_new\\.bed$")

# One vcftools command per BED file; an empty file list yields no commands
vcf_cmds <- vapply(bedfiles, function(bed_file) {
    fname <- sub("\\.bed$", '', bed_file)
    paste0("vcftools --gzvcf Data/new_vcf/3pop/3pops.MD7000_NS0.5_maf05.vcf.gz --bed Output/COV/COVscan_3pop/cutoff/", bed_file, " --out Output/COV/COVscan_3pop/cutoff/", fname, " --recode --keep-INFO-all \n")
}, character(1), USE.NAMES = FALSE)

# Written directly to the file so console output is never left redirected
cat(c("#!/bin/bash \n\n", vcf_cmds), file = "../COVscan_createVCFs_3Pops_cutoff.sh", sep = "")

4.1.1 Create bed files and scripts to run SnpEff with less padded windows (10k)

Create VCF files with selected regions & run snpEff

#Create bed files
# Same export as for the 100k padding, but each window is padded by only 10k
# and the files carry the "_10kpad" suffix.
cv <- c("cov12", "cov13", "cov23")
#Prevent scientific notation in bed files
options(scipen = 999)

cutoff_dir <- "../Output/COV/COVscan_3pop/cutoff/"

# vcftools treats the first line of a BED file as a header, so every file is
# written with a header line; otherwise the first region would be skipped.
for (i in 1:3) {
    df <- read.csv(paste0(cutoff_dir, "3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    #add 10k (start is clamped at 0 because BED coordinates cannot be negative)
    df$start <- pmax(df$start - 10000, 0)
    df$end <- df$end + 10000

    # SS is only exported for the third period
    bed_pops <- if (i == 3) c("PWS", "TB", "SS") else c("PWS", "TB")
    for (bed_pop in bed_pops) {
        bed <- df[df$pop == bed_pop, 1:3]
        colnames(bed) <- c('track type=bedGraph', '1', '1')
        write.table(bed, paste0(cutoff_dir, bed_pop, "_outliers_", cv[i], "_10kpad.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
    }
}

# Create a bash script to create vcf files with selected regions
bedfiles <- list.files(cutoff_dir, pattern = "_10kpad\\.bed$")

# One vcftools command per BED file; an empty file list yields no commands
vcf_cmds <- vapply(bedfiles, function(bed_file) {
    fname <- sub("\\.bed$", '', bed_file)
    paste0("vcftools --gzvcf Data/new_vcf/3pop/3pops.MD7000_NS0.5_maf05.vcf.gz --bed Output/COV/COVscan_3pop/cutoff/", bed_file, " --out Output/COV/COVscan_3pop/cutoff/", fname, " --recode --keep-INFO-all \n")
}, character(1), USE.NAMES = FALSE)

# Written directly to the file so console output is never left redirected
cat(c("#!/bin/bash \n\n", vcf_cmds), file = "../COVscan_createVCFs_3Pops_cutoff_10kpad.sh", sep = "")
#create a bash script to run snpEff
# Only the VCFs from the 100k-padded BED files ("_new.recode.vcf") are listed;
# the "_10kpad" VCFs are handled by their own script.
vfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "_new\\.recode\\.vcf$")

# Per VCF: one snpEff call, then one bcftools call that extracts the annotation
snpeff_cmds <- vapply(vfiles, function(vcf_file) {
    fname <- sub("_new\\.recode\\.vcf$", "", vcf_file)
    paste0(
        "java -Xmx8g -jar snpEff.jar Ch_v2.0.2.99 ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", vcf_file, " -stats ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", fname, ".html >  ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/Anno.", fname, ".vcf \n",
        #extract the annotation information
        "bcftools query -f '%CHROM %POS %INFO/AF %INFO/ANN\\n' ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/Anno.", fname, ".vcf > ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", fname, "_annotation \n\n"
    )
}, character(1), USE.NAMES = FALSE)

# Written directly to the file so console output is never left redirected
cat(c("#!/bin/bash \n\n", snpeff_cmds), file = "~/programs/snpEff/runsnpEff_cov_3pop_cutoff.sh", sep = "")


## for 10k pad vcf

#create a bash script to run snpEff
vfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "10kpad\\.recode\\.vcf$")

# Per VCF: one snpEff call, then one bcftools call that extracts the annotation
snpeff_cmds <- vapply(vfiles, function(vcf_file) {
    fname <- sub("\\.recode\\.vcf$", "", vcf_file)
    paste0(
        "java -Xmx8g -jar snpEff.jar Ch_v2.0.2.99 ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", vcf_file, " -stats ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", fname, ".html >  ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/Anno.", fname, ".vcf \n",
        #extract the annotation information
        "bcftools query -f '%CHROM %POS %INFO/AF %INFO/ANN\\n' ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/Anno.", fname, ".vcf > ~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/", fname, "_annotation \n\n"
    )
}, character(1), USE.NAMES = FALSE)

# Written directly to the file so console output is never left redirected
cat(c("#!/bin/bash \n\n", snpeff_cmds), file = "~/programs/snpEff/runsnpEff_cov_3pop_cutoff_10kpad.sh", sep = "")

4.2 Create summary gene files from snpEff and check overlapping genes.

## Create summary files of snpEff results (gene annotations in the regions of interest) and reformat as a ShinyGo input 

# For every snpEff "genes.txt" summary in the cutoff directory, write the unique
# gene IDs (each followed by "; ") to geneIDlist_<name>.txt.
gfiles<-list.files("../Output/COV/COVscan_3pop/cutoff/", pattern="genes.txt")

# seq_along() so that nothing is read or written when no summary file is found
for (i in seq_along(gfiles)){
    df<-read.table(paste0("../Output/COV/COVscan_3pop/cutoff/",gfiles[i]), sep="\t")
    # only the first seven columns are used; they are named here
    df<-df[,1:7]
    colnames(df)<-c("GeneName","GeneId","TranscriptId","BioType","variants_impact_HIGH","variants_impact_LOW",  "variants_impact_MODERATE")
    
    fname<-gsub(".genes.txt","",gfiles[i])
    genes<-unique(df$GeneId)
    # cat(file=) replaces sink()/sink(NULL): console output cannot stay diverted after an error
    cat(paste0(genes,"; "), file=paste0("../Output/COV/COVscan_3pop/cutoff/geneIDlist_",fname,".txt"))
}

#Annotation info from SnpEff
# For each covariance period and population, split the ANN field (4th column of the
# bcftools output) of every outlier SNP into its annotation records, keep the
# effect / impact / gene name / gene ID / feature type fields, and write one CSV per
# population and period with a single (the first) annotation row per chr/pos.
cv<-c("cov12","cov13","cov23")
for (cvi in seq_along(cv)){
    # cov12 and cov13 are read for the first two entries of `pops`, cov23 for the
    # first three (assumes pops is c("PWS","TB","SS") at this point - TODO confirm)
    npops<-if (cvi==3) 3 else 2
    for (p in seq_len(npops)){
        ano<-read.table(paste0("../Output/COV/COVscan_3pop/cutoff/",pops[p],"_outliers_",cv[cvi],"_annotation"), header = FALSE)
        # collect the per-SNP tables in a list and bind once, instead of growing a data frame
        per_snp<-vector("list", nrow(ano))
        for (i in seq_len(nrow(ano))){
            # records are comma-separated, fields within a record are pipe-separated
            anns<-unlist(strsplit(as.character(ano$V4[i]), "\\,|\\|"))
            # the fields are laid out 16 per annotation record
            annm<-data.frame(matrix(anns,ncol = 16, byrow = TRUE))
            annm<-annm[,c(2,3,4,5,8)]
            colnames(annm)<-c("Effect","Putative_impact","Gene_name","Gene_ID","Feature type")
            annm<-annm[!duplicated(annm), ]
            annm$chr<-ano$V1[i]
            annm$pos<-ano$V2[i]
            annm$AF<- ano$V3[i]
            per_snp[[i]]<-annm
        }
        annotations<-do.call(rbind, per_snp)
        # put chr, pos, AF first
        annotations<-annotations[,c(6:8,1:5)]
        # keep only the first annotation row of each chr/pos
        annotations<-annotations[!duplicated(annotations[,1:2]),]
        write.csv(annotations, paste0("../Output/COV/COVscan_3pop/cutoff/Genes_",pops[p],"_outliers_100k_",cv[cvi],".csv"), row.names = F)
    }
}

4.2.1 for 10k pad data

## Create summary files of snpEff results (gene annotations in the regions of interest) and reformat as a ShinyGo input 

# For every 10 kb padded snpEff "genes.txt" summary, write the unique gene IDs
# (each followed by "; ") to geneIDlist_<name>.txt.
gfiles<-list.files("../Output/COV/COVscan_3pop/cutoff/", pattern="10kpad.genes.txt")
# seq_along() so that nothing is read or written when no summary file is found
for (i in seq_along(gfiles)){
    df<-read.table(paste0("../Output/COV/COVscan_3pop/cutoff/",gfiles[i]), sep="\t")
    # only the first seven columns are used; they are named here
    df<-df[,1:7]
    colnames(df)<-c("GeneName","GeneId","TranscriptId","BioType","variants_impact_HIGH","variants_impact_LOW",  "variants_impact_MODERATE")
    
    fname<-gsub(".genes.txt","",gfiles[i])
    genes<-unique(df$GeneId)
    # cat(file=) replaces sink()/sink(NULL): console output cannot stay diverted after an error
    cat(paste0(genes,"; "), file=paste0("../Output/COV/COVscan_3pop/cutoff/geneIDlist_",fname,".txt"))
}


### no enrichment found from 10k pad (PWS-cov12)

4.3 Find the overlapping gene names

# For every per-population gene table (Genes_<pop>_outliers_100k_<cov>.csv) build a
# chr / Gene_name / Gene_ID list in which each intergenic annotation ("geneA-geneB")
# is replaced by its two flanking genes. Entries whose name cannot be split
# unambiguously are written to a separate "Oddnames_" file for manual curation.
gnamesfiles<-list.files("../Output/COV/COVscan_3pop/cutoff/", pattern="Genes_.+outliers_100k.+\\d.csv$")

for (i in seq_along(gnamesfiles)){
    df<-read.csv(paste0("../Output/COV/COVscan_3pop/cutoff/",gnamesfiles[i]))
    # columns 1, 6 and 7 are used below as chr, Gene_name and Gene_ID
    df<-df[,c(1,6:7)]
    df<-df[!duplicated(df),]
    
    fname<-gsub(".csv","", gnamesfiles[i])
    fname<-gsub("Genes_","", fname)
    
    #add gene names for front and back of intergenic regions
    # logical mask (not grep indices): df[-integer(0),] would drop every row
    # when no intergenic entry exists
    is_intergenic<-grepl("-", df$Gene_ID)
    df2<-df[is_intergenic,]
    div_chr<-character(0)
    div_name<-character(0)
    div_id<-character(0)
    odd_rows<-integer(0)
    for (j in seq_len(nrow(df2))){
        names<-unlist(strsplit(as.character(df2$Gene_name[j]), "-"))
        ids<-unlist(strsplit(as.character(df2$Gene_ID[j]), "-"))
        
        newnames<-NULL
        if (length(names)==2){
            newnames<-names
        } else if (length(names)==3){
            # one of the two gene names is an "si:" name that itself contains a hyphen;
            # re-join its two pieces
            n<-grep("si:", names)
            if (length(n)==1 && n==1) newnames<-c(paste0(names[1],"-",names[2]), names[3])
            if (length(n)==1 && n==2) newnames<-c(names[1],paste0(names[2],"-",names[3]))
        }
        
        if (is.null(newnames)){
            # anything else cannot be split safely: leave it for manual curation
            odd_rows<-c(odd_rows, j)
        } else {
            div_chr<-c(div_chr, rep(as.character(df2$chr[j]), 2))
            div_name<-c(div_name, newnames)
            div_id<-c(div_id, ids[1:2])
        }
    }
    oddnames<-df2[odd_rows,]
    # Build the flanking-gene table with the same column names as df. Growing an empty
    # data.frame with rbind(df_div, c(...)) gives auto-generated column names, so the
    # Gene_ID filter below would empty the table and the flanking genes would be lost.
    df_div<-data.frame(chr=div_chr, Gene_name=div_name, Gene_ID=div_id, stringsAsFactors=FALSE)
    df_div<-df_div[!duplicated(df_div),]
    # chromosome-end placeholders are not genes
    df_div<-df_div[!(df_div$Gene_ID %in% c("CHR_END","CHR_START")),]
    
    df<-df[!is_intergenic,]
    df<-rbind(df, df_div)
    df<-df[!duplicated(df),]
    
    if (nrow(oddnames)!=0){
        write.csv(df, paste0("../Output/COV/COVscan_3pop/cutoff/",fname,"GeneList_withIntergenicGenes.csv" ), row.names = F)
        write.csv(oddnames, paste0("../Output/COV/COVscan_3pop/cutoff/Oddnames_", fname,".csv"))
    }
    if (nrow(oddnames)==0){
        # no manual curation needed: the list is final and gets the "_new" suffix directly
        write.csv(df, paste0("../Output/COV/COVscan_3pop/cutoff/",fname,"GeneList_withIntergenicGenes_new.csv" ), row.names = F)
     }
}
   

## !! ##
## Manually curate the odd names and add them to the GeneList (XX.GeneList_withIntergenicGenes.csv) files.
# Add the gene IDs to the geneIDlist_XX_outliers_covxx.txt files as well - the only one that needs updating is TB cov13
# (updated file names have "_new" at the end)

#aggregate all gene names
# GeneList: one gene table per "_new" file, named by the file-name prefix.
# Genes:    all tables stacked, duplicate rows removed.
# NOTE(review): the "_new" lists are written to ../Output/COV/COVscan_3pop/cutoff/ earlier in
# this notebook, but are read here from the parent directory - confirm this is the intended location.
gnew<-list.files("../Output/COV/COVscan_3pop/", pattern="GeneList_withIntergenicGenes_new.csv$")
Genes<-data.frame()
GeneList<-list()
# seq_along() so that an empty directory does not trigger a read of "NA"
for (i in seq_along(gnew)){
    df<-read.csv(paste0("../Output/COV/COVscan_3pop/", gnew[i]))
    GeneList[[i]]<-df
    fname<-gsub("GeneList_withIntergenicGenes_new.csv",'',gnew[i])
    names(GeneList)[i]<-fname
    Genes<-rbind(Genes, df)
}
# one de-duplication after stacking keeps the first occurrence of every row
if (nrow(Genes)>0) Genes<-Genes[!duplicated(Genes),]


#1. Between populations
# For each covariance period, intersect the gene names of the populations, store the
# shared sets in `common`, record the counts in `common_summary`, and save Venn diagrams.
# `common` ends up with six entries, in this order:
#   cov12, cov13, cov23_PWS.SS, cov23_PWS.TB, cov23_SS.TB, cov23_3pops
# intersect() is applied to one-column data frames and its result is passed to nrow()
# (presumably dplyr's data-frame method - TODO confirm).
times<-c("cov12","cov13","cov23")
common<-list()
common_summary<-data.frame(time=times)
for (i in 1:3){
    # gene tables whose list name contains this period
    tlist<-GeneList[grep(times[i], names(GeneList))]
    if (i !=3){
        # cov12 / cov13: two tables are used
        # NOTE(review): tlist[[1]] / tlist[[2]] are taken by position and labelled PWS / TB below;
        # this assumes the list order of GeneList - confirm
        common_genes<-intersect(tlist[[1]]["Gene_name"], tlist[[2]]["Gene_name"])
        common[[i]]<-common_genes
        names(common)[[i]]<-times[i]
        common_summary$PWS[i]<-nrow(tlist[[grep("PWS", names(tlist))]])
        common_summary$TB[i]<-nrow(tlist[[grep("TB", names(tlist))]])
        common_summary$SS[i]<-NA
        common_summary$common_PWS.TB[i]<-nrow(common_genes)
        
        pws<-tlist[[1]]["Gene_name"]
        tb<-tlist[[2]]["Gene_name"]
        x<-list(PWS=pws$Gene_name,TB=tb$Gene_name)
        # ggsave() is called without a plot argument, right after the plot expression
        ggvenn(x, fill_color = cols[c(2,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(times[i])
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_",times[i],".png"), width = 3, height=3, dpi=300)
    }
    if (i==3){
        # cov23: three tables are used
        common_summary$PWS[i]<-nrow(tlist[[grep("PWS", names(tlist))]])
        common_summary$TB[i]<- nrow(tlist[[grep("TB", names(tlist))]])
        common_summary$SS[i]<- nrow(tlist[[grep("SS", names(tlist))]])
        
        # NOTE(review): by position, [[1]] is treated as PWS, [[2]] as SS and [[3]] as TB - confirm
        # genes1 = [[1]] & [[3]], genes2 = [[1]] & [[2]], genes3 = [[2]] & [[3]], genes4 = all three
        genes1<-intersect(tlist[[1]]["Gene_name"], tlist[[3]]["Gene_name"])
        genes2<-intersect(tlist[[1]]["Gene_name"], tlist[[2]]["Gene_name"])
        genes3<-intersect(tlist[[2]]["Gene_name"], tlist[[3]]["Gene_name"])
        genes4<-intersect(tlist[[1]]["Gene_name"],intersect(tlist[[2]]["Gene_name"], tlist[[3]]["Gene_name"]))
        common_summary$common_PWS.TB[i]<-nrow(genes1)
        common_summary$common_PWS.SS[i]<-nrow(genes2)
        common_summary$common_SS.TB[i]<-nrow(genes3)
        common_summary$common3[i]<-nrow(genes4)
        # the four cov23 sets fill slots 3 to 6 of `common`
        k=i
        common[[k]]<-genes2
        names(common)[[k]]<-paste0(times[i],"_PWS.SS")
        k=k+1
        common[[k]]<-genes1
        names(common)[[k]]<-paste0(times[i],"_PWS.TB")
        k=k+1
        common[[k]]<-genes3
        names(common)[[k]]<-paste0(times[i],"_SS.TB")
        k=k+1
        common[[k]]<-genes4
        names(common)[[k]]<-paste0(times[i],"_3pops")
        
        pws<-tlist[[1]]["Gene_name"]
        tb<-tlist[[3]]["Gene_name"]
        ss<-tlist[[2]]["Gene_name"]
        # three-set Venn diagram, then one diagram per pair
        x<-list(PWS=pws$Gene_name,TB=tb$Gene_name, SS=ss$Gene_name)
        ggvenn(x, fill_color = cols[c(2,1,3)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(times[i])
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_",times[i],".png"), width = 4, height=4, dpi=300)
        
         x1<-list(PWS=pws$Gene_name,TB=tb$Gene_name)
        ggvenn(x1, fill_color = cols[c(2,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(times[i])
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_PWS_TB_",times[i],".png"), width = 3, height=3, dpi=300)
         x2<-list(PWS=pws$Gene_name,SS=ss$Gene_name)
        ggvenn(x2, fill_color = cols[c(2,3)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(times[i])
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_PWS_SS_",times[i],".png"), width = 3, height=3, dpi=300)
          x3<-list(SS=ss$Gene_name, TB=tb$Gene_name)
        ggvenn(x3, fill_color = cols[c(3,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(times[i])
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_SS_TB_",times[i],".png"), width = 3, height=3, dpi=300)
        
        
        }
}
write.csv(common_summary, "../Output/COV/COVscan_3pop/cutoff/Common_genes_withIntergenes_3pops.csv")


#What are the overlapping gene names between populations
# For every shared gene set in `common`, attach the remaining columns of `Genes`
# by Gene_name, save the table, and keep it in `common_times` under the same name.
common_times<-list()
# seq_along() so that an empty `common` list is skipped cleanly
for (i in seq_along(common)){
    gids<-common[[i]]
    df<-data.frame(Gene_name=gids)
    
    df2<-merge(df, Genes, by="Gene_name")
    write.csv(df2, paste0("../Output/COV/COVscan_3pop/cutoff/Common_genes_", names(common)[i],".csv"), row.names = F)
    common_times[[i]]<-df2
    names(common_times)[i]<- names(common)[i]
}


# Write the unique Gene_ID values of `gene_df`, each followed by ";", to `path`
# (space-separated, no trailing newline) and return them invisibly.
# cat(file=) is used instead of sink()/sink(NULL) so output is never left diverted.
write_gene_ids<-function(gene_df, path){
    ids<-unique(gene_df$Gene_ID)
    cat(paste0(ids, ";"), file=path)
    invisible(ids)
}

#overlapping genes COV12

tlist<-GeneList[grep(times[1], names(GeneList))]
genes1<-intersect(tlist[[1]]["Gene_ID"], tlist[[2]]["Gene_ID"]) #common genes between PWS and TB in cov12
g1<-write_gene_ids(genes1, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_PWS_TB_cov12.txt")

# Overlapping genes COV13

tlist<-GeneList[grep(times[2], names(GeneList))]
genes1<-intersect(tlist[[1]]["Gene_ID"], tlist[[2]]["Gene_ID"]) #common genes between PWS and TB in cov13
g1<-write_gene_ids(genes1, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_PWS_TB_cov13.txt")


#overlapping genes COV23
# by position, [[1]] is treated as PWS, [[2]] as SS and [[3]] as TB

tlist<-GeneList[grep(times[3], names(GeneList))]
pws23<-tlist[[1]]["Gene_ID"]
ss23<-tlist[[2]]["Gene_ID"]
tb23<-tlist[[3]]["Gene_ID"]
genes1<-intersect(tlist[[1]]["Gene_ID"], tlist[[3]]["Gene_ID"]) #common genes between PWS and TB in cov23
genes2<-intersect(tlist[[1]]["Gene_ID"], tlist[[2]]["Gene_ID"]) #common genes between PWS and SS in cov23
genes3<-intersect(tlist[[2]]["Gene_ID"], tlist[[3]]["Gene_ID"]) #common genes between SS and TB in cov23
genes4<-intersect(tlist[[1]]["Gene_ID"],intersect(tlist[[2]]["Gene_ID"], tlist[[3]]["Gene_ID"])) # Common genes in all 3 populations

g1<-write_gene_ids(genes1, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_PWS_TB_cov23.txt")
g2<-write_gene_ids(genes2, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_PWS_SS_cov23.txt")
g3<-write_gene_ids(genes3, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_SS_TB_cov23.txt")
g4<-write_gene_ids(genes4, "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/geneIDs_all3_cov23.txt")



#2. Between Time-Points within a population
# For each of the two populations: intersect the gene names of every pair of
# covariance periods and of all three periods, keep the shared sets in `common2`
# (four slots per population), tabulate the counts in `common_summary2`, and save
# a three-set Venn diagram.

times<-c("cov12","cov13","cov23")
pops<-c("PWS","TB")
common2<-list()
common_summary2<-data.frame(pop=rep(pops[1:2], each=4))
# index pairs into `times` / `plist`: 12-13, 12-23, 13-23
period_pairs<-list(c(1,2), c(1,3), c(2,3))
for (i in 1:length(pops)){
    plist<-GeneList[grep(pops[i], names(GeneList))]
    slot0<-4*i-4

    # pairwise overlaps fill the first three slots of this population
    for (m in seq_along(period_pairs)){
        a<-period_pairs[[m]][1]
        b<-period_pairs[[m]][2]
        k<-slot0+m
        shared<-intersect(plist[[a]]["Gene_name"], plist[[b]]["Gene_name"])
        common2[[k]]<-shared
        names(common2)[[k]]<-paste0(pops[i],".", times[a],"_",times[b])
        common_summary2$Time[k]<-paste0(times[a],"_",times[b])
        common_summary2$no.of.genes[k]<-nrow(shared)
    }

    # overlap among all three periods fills the fourth slot
    k<-slot0+4
    shared_all<-intersect(plist[[1]]["Gene_name"], (intersect(plist[[2]]["Gene_name"], plist[[3]]["Gene_name"])))
    common2[[k]]<-shared_all
    names(common2)[[k]]<-paste0(pops[i],".all")
    common_summary2$Time[k]<-"All"
    common_summary2$no.of.genes[k]<-nrow(shared_all)

    # Venn diagram of the three periods for this population
    x<-list(COV12=plist[[1]][["Gene_name"]],
            COV13=plist[[2]][["Gene_name"]],
            COV23=plist[[3]][["Gene_name"]])
    ggvenn(x, fill_color = cols[c(1,5,7)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle(pops[i])
    ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_",pops[i],".png"), width = 4, height=4, dpi=300)
}
write.csv(common_summary2, "../Output/COV/COVscan_3pop/cutoff/Common_genes_betweenTimePoints.csv")


#Common gene names between time points
# For each of the two populations, annotate every shared gene set in `common2` with the
# matching rows of `Genes` (left join on Gene_name), save each set, and save the
# stacked table of all sets with a `Time` column naming the set.

for (i in 1:2){
    CommonGenes<-data.frame()
    glist<-common2[grep(pops[i], names(common2))]
    # seq_along() so that a population without any stored set is skipped cleanly
    for(j in seq_along(glist)){
        gids<-glist[[j]]
        df<-data.frame(Gene_name=gids)
        # all.x = TRUE keeps gene names that have no match in Genes
        df2<-merge(df, Genes, by="Gene_name", all.x=TRUE)
        write.csv(df2, paste0("../Output/COV/COVscan_3pop/cutoff/Common_genes_", names(glist)[j],".csv"), row.names = F)
        df2$Time<-names(glist)[j]
        CommonGenes<-rbind(CommonGenes, df2)
    }
    write.csv(CommonGenes, paste0("../Output/COV/COVscan_3pop/cutoff/Common_genes_",pops[i] ,".csv"), row.names = F)
}

4.3.0.1 Overlapping gene numbers


# Summary table
# Re-read the saved per-period gene counts (first CSV column holds the row names)
# and render them as a table.
common_genes<-read.csv(
    "../Output/COV/COVscan_3pop/cutoff/Common_genes_withIntergenes_3pops.csv",
    row.names = 1
)
knitr::kable(common_genes)

4.4 What are the genes overlapping across different time points between populations?

## Between PWS and TB
# Gene sets shared by PWS and TB in each period, taken from `common_times`:
# [[1]] = cov12, [[2]] = cov13, [[3]] = cov23_PWS.TB
pws.tb<-common_times[c(1,2,4)]

# 1. Common genes between populations across time points in PWS and TB (COV12 - COV13)
genes1213<-intersect(pws.tb[[1]]["Gene_name"], pws.tb[[2]]["Gene_name"])
genes1213<-merge(genes1213, Genes, by="Gene_name")
# file name now matches the comparison (it was swapped with the cov12-cov23 file)
write.csv(genes1213, "../Output/COV/COVscan_3pop/cutoff/Common_genes_PWS.TB.cov12-cov13.csv")
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2             ndst2a chr13 ENSCHAG00000002649
#3             zswim8 chr13 ENSCHAG00000005956

#common gene names
# the two circles are the cov12 and cov13 shared-gene sets, so they are labelled by period
p1213<-pws.tb[[1]]["Gene_name"]
t1213<-pws.tb[[2]]["Gene_name"]
x<-list(COV12=p1213$Gene_name,COV13=t1213$Gene_name)
ggvenn(x, fill_color = cols[c(2,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle("COV12-COV13 in PWS & TB")
# NOTE(review): this figure is saved one directory above the other two Venn figures (no "cutoff/") - confirm
ggsave(paste0("../Output/COV/COVscan_3pop/Venn_PWS_TB_COV12-COV13.png"), width = 3, height=3, dpi=300)
        

# 2. Common genes between populations across time points in PWS and TB (COV12 - COV23)

genes1223<-intersect(pws.tb[[1]]["Gene_name"], pws.tb[[3]]["Gene_name"])
genes1223<-merge(genes1223, Genes, by="Gene_name")
# file name now matches the comparison (it was swapped with the cov12-cov13 file)
write.csv(genes1223, "../Output/COV/COVscan_3pop/cutoff/Common_genes_PWS.TB.cov12-cov23.csv")
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2 ENSCHAG00000022709 chr20 ENSCHAG00000022709
#3 ENSCHAG00000022815 chr20 ENSCHAG00000022815
#4             ndst2a chr13 ENSCHAG00000002649

p1223<-pws.tb[[1]]["Gene_name"]
t1223<-pws.tb[[3]]["Gene_name"]
x<-list(COV12=p1223$Gene_name,COV23=t1223$Gene_name)
ggvenn(x, fill_color = cols[c(2,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle("COV12-COV23 in PWS & TB")
ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_PWS_TB_COV12-COV23.png"), width = 3, height=3, dpi=300)


# 3. between PWS and TB across COV13 and COV23
genes1323<-intersect(pws.tb[[2]]["Gene_name"], pws.tb[[3]]["Gene_name"])
genes1323<-merge(genes1323, Genes, by="Gene_name")
# "Common_genes_" prefix restored (the leading "C" was missing)
write.csv(genes1323, "../Output/COV/COVscan_3pop/cutoff/Common_genes_PWS.TB.cov13-cov23.csv")
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2             ndst2a chr13 ENSCHAG00000002649

p1323<-pws.tb[[2]]["Gene_name"]
t1323<-pws.tb[[3]]["Gene_name"]
x<-list(COV13=p1323$Gene_name,COV23=t1323$Gene_name)
ggvenn(x, fill_color = cols[c(2,1)], stroke_size = 0.5, set_name_size = 4,text_size=3)+ggtitle("COV13-COV23 in PWS & TB")
ggsave("../Output/COV/COVscan_3pop/cutoff/Venn_PWS_TB_COV13-COV23.png", width = 3, height=3, dpi=300)
  • Numbers of overlapping genes between populations between time points

5 Interpopulation comparison per time period

### Interpopulation comparisons
# Build `Covs`: one row per population pair and time period, holding the
# between-population covariance and its lower / upper confidence bound.
#decode the samples to create the right matrix
cv<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_3pops.csv", header = FALSE)
labs<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_labels_3pops.csv" )
labs<-labs[,-1]
# flatten both matrices row by row so that each value sits next to its label
cvm<-data.frame(label=as.vector(t(labs)), cov=as.vector(t(cv)))

#rearrange based on comparisons: covariance between populations within the same period
#PopYr Symbols
# PH 1 'PWS', 1991
# PH 2 'PWS', 1996
# PH 3 'PWS', 2006
# PH 4 'PWS', 2017
# PH 5 'SS',  1991
# PH 6 'SS',  1996
# PH 7 'SS',  2006
# PH 8 'SS',  2017
# PH 9 'TB',  1991
# PH 10'TB',  1996
# PH 11'TB',  2006
# PH 12'TB',  2017

# Label of the matrix cell for each row of Covs (NA = no value for that pair and period)
pair_labels<-c(NA, "cov(PH: 2-1, PH: 10-9)", NA,
               "cov(PH: 3-2, PH: 7-6)", "cov(PH: 3-2, PH: 11-10)", "cov(PH: 7-6, PH: 11-10)",
               "cov(PH: 4-3, PH: 8-7)", "cov(PH: 4-3, PH: 12-11)", "cov(PH: 8-7, PH: 12-11)")

# Return column `column` of `tab` for every entry of pair_labels, as numeric.
# Stops if a label is missing or ambiguous: a zero-length match would otherwise
# shorten the vector silently and shift every later value onto the wrong row.
values_for_pairs<-function(tab, column){
    vapply(pair_labels, function(lab){
        if (is.na(lab)) return(NA_real_)
        hit<-tab[[column]][which(tab$label==lab)]
        if (length(hit)!=1) stop("expected exactly one entry labelled '", lab, "', found ", length(hit), call.=FALSE)
        as.numeric(hit)
    }, numeric(1), USE.NAMES=FALSE)
}

# 3 pairs x 3 periods = 9 rows (times=6 produced 18 rows and recycled the 9 values)
Covs<-data.frame(pops=rep(c("PWS.vs.SS", "PWS.vs.TB",  "SS.vs.TB"), times=3),
                 period=rep(c("1991-1996","1996-2006","2006-2017"), each=3))

Covs$cov<-values_for_pairs(cvm, "cov")


#C.I.
cis<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_COV_Interpop_comparison_CIs.csv")
cis<-cis[,-1]
# rows 1-11 are used as the lower bounds and rows 12-22 as the upper bounds
cim<-data.frame(label=as.vector(t(labs)), ci_l=as.vector(t(cis[1:11,])))
cim$ci_h<-as.vector(t(cis[12:22,]))

Covs$ci_l<-values_for_pairs(cim, "ci_l")

Covs$ci_h<-values_for_pairs(cim, "ci_h")


library(RColorBrewer)
display.brewer.all(type="qual")

# eight colours of the "Set3" palette, used for the inter-population plots
colors2<-brewer.pal(n=8, "Set3")
# Palette values, kept as a comment (as bare code this line is a syntax error):
# "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"

#Barplot
# Inter-population covariance per period as dodged bars with CI error bars.
# The y limits are set inside scale_y_continuous(): a separate ylim() adds a second
# y scale that replaces the first one and discards the comma labels.
# Line widths use `linewidth` (`size` is deprecated for line geoms).
ggplot(Covs, aes(x=period, y=cov, fill=pops))+
    geom_bar(stat="identity",position=position_dodge(width = 0.7), width=0.8)+
    ylab("Covariance")+xlab('')+theme_classic()+
    geom_hline(yintercept = 0,color="gray70", linewidth=0.3)+
    scale_fill_manual(values=colors2[c(4,1,3)])+
    theme(legend.title = element_blank())+
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002))+
    geom_vline(xintercept = c(1.5,2.5), color="gray", linewidth=0.2)+
    geom_errorbar(aes(ymin=ci_l, ymax=ci_h), width=.2, linewidth=.2, position=position_dodge(width = 0.7))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new.png",width = 4.8, height = 3, dpi=300)

#Point plot
# Same data as dodged points with CI error bars.
# Limits and comma labels share one y scale (a separate ylim() would replace the
# labelled scale); line widths use `linewidth`.
ggplot(Covs, aes(x=period, y=cov, color=pops))+
    geom_point(position=position_dodge(width = 0.7), size=3)+
    ylab("Covariance")+xlab('')+theme_classic()+
    geom_hline(yintercept = 0,color="gray70", linewidth=0.3)+
    scale_color_manual(values=colors2[c(4,1,3)])+
    theme(legend.title = element_blank())+
    scale_y_continuous(labels = comma, limits = c(-0.0023, 0.002))+
    geom_errorbar(aes(ymin=ci_l, ymax=ci_h), width=.2, linewidth=.2, position=position_dodge(width = 0.7))+
     geom_vline(xintercept = c(1.5,2.5), color="gray", linewidth=0.3)
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new_PointPlot.png",width = 4.8, height = 3, dpi=300)



#line plot
# Periods are mapped to 1, 2, 3 so that the points of each population pair can be
# connected along a continuous x axis; the axis is relabelled with the period names.
Covs$time<-1
Covs$time[Covs$period=="1996-2006"]<-2
Covs$time[Covs$period=="2006-2017"]<-3
# geom_path() connects rows in data order, so sort by time first
Covs<-Covs[order(Covs$time),]
# Limits and comma labels share one y scale (a separate ylim() would replace the
# labelled scale); line widths use `linewidth`.
ggplot(Covs, aes(x=time, y=cov, color=pops, group=pops))+
    geom_point(position=position_dodge(width = 0.7), size=4)+
    geom_path(position=position_dodge(width = 0.7))+
    ylab("Covariance")+xlab('')+theme_classic()+
    geom_hline(yintercept = 0,color="gray70", linewidth=0.3)+
    scale_color_manual(values=colors2[c(4,1,3)])+
    theme(legend.title = element_blank())+
    scale_y_continuous(labels = comma, limits = c(-0.0023, 0.002))+
    geom_errorbar(aes(ymin=ci_l, ymax=ci_h), width=.2, linewidth=.2, position=position_dodge(width = 0.7))+
     geom_vline(xintercept = c(1.5,2.5), color="gray", linewidth=0.2)+
    scale_x_continuous(breaks=c(1,2,3), labels = c("1991-1996","1996-2006","2006-2017"))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new_LinePlot.png",width = 4.8, height = 3, dpi=300)

5.1 Longer time period

## Longer time-period
# Build `Covs2`: inter-population covariance and CI bounds for three longer spans,
# one row per population pair and span. Pairs without a value stay NA.

# flatten a matrix-like table row by row
by_row<-function(tab) as.vector(t(tab))
# covariance stored under a given label
cov_at<-function(tab, lab) tab$cov[tab$label==lab]

Covs2<-data.frame(pops=rep(c("PWS.vs.SS", "PWS.vs.TB",  "SS.vs.TB"), times=3),
                 period=rep(c("1991-2006","1991-2017","1996-2017"), each=3))

# covariance matrices, one per span
cv1<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_1991-2006.csv", header = FALSE)
cv2<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_1991-2017.csv", header = FALSE)
cv3<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_1996-2017.csv", header = FALSE)

# matching label matrices (first column dropped)
labs1<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_labels_1991-2006.csv" )[,-1]
labs2<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_labels_1991-2017.csv" )[,-1]
labs3<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_labels_1996-2017.csv" )[,-1]

# label / value tables
cvm1<-data.frame(label=by_row(labs1), cov=by_row(cv1))
cvm2<-data.frame(label=by_row(labs2), cov=by_row(cv2))
cvm3<-data.frame(label=by_row(labs3), cov=by_row(cv3))

Covs2$cov<-c(
    NA, cov_at(cvm1, "cov(PH: 2-1, PH: 4-3)"), NA,
    NA, cov_at(cvm2, "cov(PH: 2-1, PH: 4-3)"), NA,
    cov_at(cvm3, "cov(PH: 2-1, PH: 4-3)"), cov_at(cvm3, "cov(PH: 2-1, PH: 6-5)"), cov_at(cvm3, "cov(PH: 4-3, PH: 6-5)")
)

#C.I.
cis1<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_COV_Interpop_comparison_CIs_1991-2006.csv")[,-1]
cis2<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_COV_Interpop_comparison_CIs_1991-2017.csv")[,-1]
cis3<-read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_COV_Interpop_comparison_CIs_1996-2017.csv")[,-1]

#cim<-data.frame(label=as.vector(t(labs)), ci_l=as.vector(t(cis1[1:4,])))
#cim$ci_h<-as.vector(t(cis[12:22,]))

# lower bounds, picked by row/column position in the CI tables
lower<-c(NA, cis1[1,3], NA,
         NA, cis2[1,3], NA,
         cis3[1,3], cis3[1,5], cis3[3,5])
Covs2$ci_l<-as.numeric(lower)

# upper bounds, picked by row/column position in the CI tables
upper<-c(NA, cis1[4,3], NA,
         NA, cis2[4,3], NA,
         cis3[6,3], cis3[6,5], cis3[8,5])
Covs2$ci_h<-as.numeric(upper)


# Bar plot of the longer-period covariances with CI error bars.
# Limits and comma labels share one y scale: a separate ylim() adds a second y scale
# that replaces the first one and discards the comma labels.
# Line widths use `linewidth` (`size` is deprecated for line geoms).
ggplot(Covs2, aes(x=period, y=cov, fill=pops))+
    geom_bar(stat="identity",position=position_dodge(width = 0.7), width=0.8)+
    ylab("Covariance")+xlab('')+theme_classic()+
    geom_hline(yintercept = 0,color="gray70", linewidth=0.3)+
    scale_fill_manual(values=colors2[c(4,1,3)])+
    theme(legend.title = element_blank())+
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002))+
    geom_vline(xintercept = c(1.5,2.5), color="gray", linewidth=0.2)+
    geom_errorbar(aes(ymin=ci_l, ymax=ci_h), width=.2, linewidth=.2, position=position_dodge(width = 0.7))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_LonogerPeriod.png",width = 4.9, height = 3, dpi=300)

# Point plot of the longer-period covariances with CI error bars.
# Limits and comma labels share one y scale (a separate ylim() would replace the
# labelled scale); line widths use `linewidth`.
ggplot(Covs2, aes(x=period, y=cov, color=pops))+
    geom_point(position=position_dodge(width = 0.7), size=4)+
    ylab("Covariance")+xlab('')+theme_classic()+
    geom_hline(yintercept = 0,color="gray70", linewidth=0.3)+
    scale_color_manual(values=colors2[c(4,1,3)])+
    theme(legend.title = element_blank())+
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002))+
    geom_errorbar(aes(ymin=ci_l, ymax=ci_h), width=.2, linewidth=.2, position=position_dodge(width = 0.7))+
    geom_vline(xintercept = c(1.5,2.5), color="gray", linewidth=0.3)
ggsave("../Output/COV/Interpop_cov_comparison_3PopsLonogerPeriod_PointPlot.png",width = 4.7, height = 3, dpi=300)

6 Focused freq analysis

6.1 ccr6a (chr15: 16,066,502 - 16,091,639)

# Stack the allele-frequency tables of the four PWS samples, restricted to
# chr15:16,050,000-16,100,000, and tag every row with its sampling year.
pops<-c("PWS91","PWS96","PWS07","PWS17")
yr<-c(1991,1996,2007,2017)

# read one sample's .mafs table and keep only the rows inside the window
read_window<-function(sample_id, sample_year){
    tab<-read.table(paste0("../Data/new_vcf/AF/",sample_id,".mafs"),sep="\t", header = TRUE)
    inside<-tab$chromo=="chr15"&tab$position>=16050000&tab$position<=16100000
    tab<-tab[inside,]
    tab$year<-sample_year
    tab
}

maf<-data.frame()
for (s in seq_along(pops)){
    maf<-rbind(maf, read_window(pops[s], yr[s]))
}
write.csv(maf,"../Output/COV/COVscan_3pop/cutoff/ccrc6_MAFchange_chr15_16Mb.csv")

# Classify every site by its allele-frequency trajectory over 1991 -> 1996 -> 2007:
# "down" if the frequency drops over both intervals, "up" if it rises over both,
# "none" otherwise. Sites without exactly one value in each of the three years
# (or with a missing frequency) stay "none" instead of stopping the loop.
positions<-unique(maf$position)
maf$trend<-"none"
for (i in seq_along(positions)){
    at_site<-maf$position==positions[i]
    df<-maf[at_site,]
    af91<-df$knownEM[df$year==1991]
    af96<-df$knownEM[df$year==1996]
    af07<-df$knownEM[df$year==2007]
    # a zero-length or multi-value comparison cannot be used as an if() condition
    if (length(af91)!=1 || length(af96)!=1 || length(af07)!=1) next
    #AF both decreased
    if (isTRUE(af91>af96 && af96>af07)){
        maf$trend[at_site]<-"down"
    }
    #AF both increased
    else if (isTRUE(af91<af96 && af96<af07)){
        maf$trend[at_site]<-"up"
    }
}

# Save the table again, now including the trend column, then plot the frequency
# of every site over the sampling years (one colour per site).
write.csv(maf,"../Output/COV/COVscan_3pop/cutoff/ccrc6_MAFchange_chr15_16Mb.csv")

ggplot(maf, aes(x=year, y=knownEM, color=factor(position)))+
    geom_point(size=1.5)+
    geom_line(size=0.3)+
    labs(title="CCRC6 gene AF changes", y="maf")+
    theme(legend.title=element_blank())+
    theme_minimal()+
    theme(legend.text = element_text(size=5), legend.title=element_blank())
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15.png", width = 6, height=3, dpi=300)

#color by trend
# Same trajectories, coloured by trend class; the factor levels fix the order
# in which the three manual colours are assigned (up, down, none).
maf$trend<-factor(maf$trend, levels=c("up","down","none"))
ggplot(maf, aes(x=year, y=knownEM, color=trend))+
    geom_point(size=1.5)+
    geom_path(aes(group=position), size=0.3)+
    labs(title="CCRC6 gene AF changes 1991-2007", y="maf")+
    theme(legend.title=element_blank())+
    theme_minimal()+
    theme(legend.title=element_blank())+
    scale_color_manual("Trend",values=c("deeppink2","royalblue", "gray"))
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_trend.png", width = 6, height=3, dpi=300)

# Plot the rising and the falling sites separately, labelling each trajectory
# with its position at the last sampling year.
up<-maf[maf$trend=="up",]
down<-maf[maf$trend=="down",]
library(ggrepel)

# Plot separately

# label only the last point of each trajectory
up<-up %>% mutate(label = if_else(year == max(year), as.character(position), NA_character_))
ggplot(up, aes(x=year, y=knownEM))+
    geom_point(size=1.5,color="deeppink2", alpha=0.8)+
    geom_path(aes(group=position), linewidth=0.3,color="deeppink2" )+ggtitle(paste0("CCRC6 AF changes 1991-2007, Up ",length((unique(maf$position[maf$trend=="up"])))," loci"))+
    ylab("maf")+
    theme(legend.title=element_blank())+
    theme_minimal()+theme(legend.title=element_blank())+
    geom_label_repel(aes(label = label), label.size=0.1, size = 2,
                  nudge_x = 2,
                  na.rm = TRUE)
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_Up.png", width = 6, height=3, dpi=300)

down<-down %>% mutate(label = if_else(year == max(year), as.character(position), NA_character_))
# title says "Down" (it previously repeated "Up" from the plot above)
ggplot(down, aes(x=year, y=knownEM))+
    geom_point(size=1.5,color="royalblue", alpha=0.8)+
    geom_path(aes(group=position), linewidth=0.3,color="royalblue" )+ggtitle(paste0("CCRC6 AF changes 1991-2007, Down ",length((unique(maf$position[maf$trend=="down"])))," loci"))+
    ylab("maf")+
    theme(legend.title=element_blank())+
    theme_minimal()+theme(legend.title=element_blank())+
     geom_label_repel(aes(label = label), label.size=0.1, size = 3,
                  nudge_x = 2,
                  na.rm = TRUE)
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_down.png", width = 6, height=3, dpi=300)

6.1.1 AF changes in all 3 pops

### TB, PWS and SS
# Stack the allele-frequency tables of all eleven samples, restricted to
# chr15:16,050,000-16,100,000, tagging each row with sampling year and population
# (the sample name with its two-digit year removed), then plot one panel per site.
pops<-c("TB91","TB96","TB06","TB17","PWS91","PWS96","PWS07","PWS17","SS96","SS06","SS17")
yr<-c(1991,1996,2006,2017,1991,1996,2007,2017,1996,2006,2017)

# read one sample's .mafs table and keep only the rows inside the window
read_window_pop<-function(sample_id, sample_year){
    tab<-read.table(paste0("../Data/new_vcf/AF/",sample_id,".mafs"),sep="\t", header = TRUE)
    inside<-tab$chromo=="chr15"&tab$position>=16050000&tab$position<=16100000
    tab<-tab[inside,]
    tab$year<-sample_year
    tab$pop<-sub("\\d\\d","", sample_id)
    tab
}

maf<-data.frame()
for (s in seq_along(pops)){
    maf<-rbind(maf, read_window_pop(pops[s], yr[s]))
}
#write.csv(maf,"../Output/COV/COVscan_3pop/AF_maf_chr13_23Mb.csv")


ggplot(maf, aes(x=year, y=knownEM, color=pop))+
    facet_wrap(~factor(position))+
    geom_point(size=1.5)+
    geom_path(linewidth=0.6)+
    labs(title="CCRC6 gene (chr15)", y="maf")+
    theme(legend.title=element_blank())+
    scale_color_manual(values=cols[c(2,3,1)])
ggsave("../Output/COV/COVscan_3pop/cutoff/ccrc6_AFchanges_allPops.png", width = 8, height=6, dpi=300)

---
title: "COV scan 3 populations"
output:
  html_notebook:
      toc: true 
      toc_float: true
      number_sections: true
      theme: lumen
      highlight: tango
      code_folding: hide
      df_print: paged
---

* This notebook summarizes the results from temporal covariance analysis from CVTKPY.  
* The covariance window size used was 100k (but the window sizes did not affect the results) 

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
source("../Rscripts/BaseScripts.R")
library(tidyverse)
library(dplyr)
library(cowplot)
library(scales)
library(ggvenn)
```

# Results from CVTKPY: genome-wide temporal covariances of allele frequencies 

```{r eval=FALSE, message=FALSE, warning=FALSE}
pops<-c("PWS","TB","SS")

# Read a block of the covariance file row by row (row 1 left to right, then row 2, ...)
by_row<-function(block){
    unlist(lapply(seq_len(nrow(block)), function(r) unlist(block[r,], use.names=FALSE)))
}

# Build `covs`: for every population, the covariance entries of its matrix with
# plotting position (year), series start and population name attached.
covs<-data.frame()
for (p in seq_along(pops)){
    #covariance output file (first column dropped)
    cov_raw<-read.csv(paste0("~/Projects/Pacherring_Vincent/MD7000/3Pops_maf05_temp_cov_matrix_",pops[p],"_100k.csv"))[,-1]

    #reshape the matrix: rows 1-3 hold the values, rows 4-6 the matching labels
    value_block<-cov_raw[1:3,]
    label_block<-cov_raw[4:6,]
    covdf<-data.frame(label=by_row(label_block), value=by_row(value_block), stringsAsFactors=FALSE)
    covdf$value<-as.numeric(covdf$value)

    # keep only the entries whose label contains "cov"
    covar<-covdf[grep("cov",covdf$label),]

    #remove the redundant values
    if (pops[p]=="SS"){
        covar<-covar[c(1,2,4),]
    } else {
        covar<-covar[!duplicated(covar[, 2]),]
    }

    #assign the starting time period and covering period values
    covar$year<-c(1,2,2)
    covar$series<-c("1991","1991","1996")

    #assign population name
    covar$location<-pops[p]

    #combine in to one matrix
    covs<-rbind(covs, covar)
}

colnames(covs)[2]<-"cov"
covs$time<-rep(c("cov12","cov13","cov23"), 3)

# 95% confidence intervals (calculated from the 'straps' returned from bootstrap_cov2() ci=1.96*sd(straps))
# Attach the CI half-width of every covariance to `covs`.
# For the first two populations the three values are taken from cells [1,2] (cov12),
# [1,3] (cov13) and [2,3] (cov23) of the CI file; for the third population only
# cell [1,2] is used, for cov23.
time<-c("cov12","cov13","cov23")

covs$ci<-NA
for (i in seq_along(pops)){  
    df<-read.csv(paste0("~/Projects/Pacherring_Vincent/MD7000/",pops[i],"_CIs_100kwindow.csv"), header=FALSE)
    in_pop<-covs$location==pops[i]
    # Rows are matched on covs$time itself, not on the 3-element `time` vector
    # (which only lined up through recycling).
    if (i!=3){
        covs$ci[in_pop&covs$time=='cov12']<-df[1,2]
        covs$ci[in_pop&covs$time=='cov13']<-df[1,3]
        covs$ci[in_pop&covs$time=='cov23']<-df[2,3]
    } else {
        # The test was on `p`, the index left over from an earlier loop; being 3, it made
        # this branch run for every population and overwrite the cov23 CI set just above.
        covs$ci[in_pop&covs$time=='cov23']<-df[1,2] 
    }
}

write.csv(covs,"../Output/COV/GW_covariance_CIs.csv")

# Covariance against plotting position (1 or 2), one line per population and series,
# with +/- CI error bars; all layers share the same small dodge.
xtexts<-c("\u03941991-1996\n ~ \u03941996-2006", "\n  ~ \u03942006-2017")
dodge<-position_dodge(width = 0.1,preserve ="total")

ggplot(data=covs, aes(x=year, y=cov, color=location, shape=series, group=interaction(location, series)))+
        geom_point(size=3, position=dodge)+
        # data and x/y/colour/group mappings are inherited from ggplot()
        geom_line(position=dodge)+
        labs(x='', y="Covariance")+
        theme_classic()+
        theme(legend.title = element_blank())+
        geom_hline(yintercept = 0,color="gray70", size=0.3)+
        geom_errorbar(aes(ymin=cov-ci, ymax=cov+ci), width=.2, size=.2, position=dodge)+
        scale_shape_manual(values=c(16,17),labels=c("\u0394'91-'96~","\u0394'96-'06~"))+
        scale_x_continuous(breaks = c(1,2), labels=xtexts)+
        scale_color_manual(values=cols[c(2,3,1)])+
        ylim(-0.0023,0.002)
ggsave("../Output/COV/3Pops_Cov_overtime_CIestimated.png",width = 4.7, height = 3, dpi=300)
    
# Covariance per comparison (cov12, cov23, cov13 from left to right) as points with
# +/- CI error bars, coloured by population, with separators between comparisons.
covs$time<-factor(covs$time, levels=c("cov12","cov23","cov13"))
#xtexts<-c("\u03941991-1996\n ~ \u03941996-2006", "\u03941996-2006\n  ~ \u03942006-2017", "\u03941991-1996\n  ~ \u03942006-2017")
xtexts<-c("\u0394'91-'96\n ~ \u0394'96-'06", "\u0394'96-'06\n  ~ \u0394'06-'17", "\u0394'91-'96\n  ~ \u0394'06-'17")
dodge<-position_dodge(width = 0.1,preserve ="total")

ggplot(data=covs, aes(x=time, y=cov, color=location))+
        geom_point(size=3, position=dodge)+
        #geom_line(data=covs, aes(x=year, y=cov,color=location, group=interaction(location, series)), position=position_dodge(width = 0.1,preserve ="total"))+
        labs(x='', y="Covariance")+
        theme_classic()+
        theme(legend.title = element_blank(), axis.text.x = element_text(size=9))+
        geom_hline(yintercept = 0,color="gray70", size=0.3)+
        geom_errorbar(aes(ymin=cov-ci, ymax=cov+ci), width=.2, size=.2, position=dodge)+
        scale_x_discrete(labels=xtexts)+
        scale_color_manual(values=cols[c(2,3,1)])+
        geom_vline(xintercept = c(1.5,2.5), color="gray", size=0.2)+
        ylim(-0.0023,0.002)
ggsave("../Output/COV/3Pops_Cov_CI_3timepoints.png",width = 4.57, height = 3, dpi=300)

```
![](../Output/COV/3Pops_Cov_overtime_CIestimated.png)

![](../Output/COV/3Pops_Cov_CI_3timepoints.png)

# Find regions with high covariances in each population
* From Temporal Covariance analysis  -output covariances for each time period

## Plot the covariances across the genome  

```{r eval=FALSE, message=FALSE, warning=FALSE}

# Load the per-window temporal covariances for every population, store one
# table per population in `cov.list` (named "<pop>_<winsize>") and build a
# genome-wide scatter plot of the covariances.
pops <- c("PWS", "TB", "SS")
winsize <- "100k"
evens <- paste0("chr", seq(2, 26, by = 2))
md_dir <- "~/Projects/Pacherring_Vincent/MD7000/"
cov.list <- list()
covs_all <- list()
k <- 1
for (p in 1:length(pops)) {
    pop <- pops[p]
    # window coordinates (chrom / start / end) shared by all covariance files
    iv <- read.csv(paste0(md_dir, "3pops_intervals_", winsize, "window.csv"), row.names = 1)
    if (p != 3) {
        # PWS and TB: three interval pairs are read
        cov12 <- read.csv(paste0(md_dir, pop, "_cov12_1996-1991_2006-1996_3Pops_", winsize, "window.csv"), header = F)
        cov23 <- read.csv(paste0(md_dir, pop, "_cov23_2017-2006_2006-1996_3Pops_", winsize, "window.csv"), header = F)
        cov13 <- read.csv(paste0(md_dir, pop, "_cov13_2017-2006_1996-1991_3Pops_", winsize, "window.csv"), header = F)
        covs <- cbind(iv, cov12, cov23, cov13)
        colnames(covs)[4:6] <- c("cov12", "cov23", "cov13")
        covs$index <- 1:nrow(covs)

        # alternate point colours between odd and even chromosomes
        covs$color <- ifelse(covs$chrom %in% evens, "col2", "col1")

        # treat non-finite covariances as missing
        covs[sapply(covs, is.infinite)] <- NA
        covs[sapply(covs, is.nan)] <- NA

        cov.list[[k]] <- covs
        names(cov.list)[k] <- paste0(pop, "_", winsize)
        k <- k + 1

        covsm <- melt(covs[, c("index", "color", "cov12", "cov23", "cov13")], id.vars = c("index", "color"))
        ymax <- max(covsm$value, na.rm = T)
        y <- min(covsm$value, na.rm = T)
        ymin <- max(y, -0.1)   # the lower axis limit never goes below -0.1
        ggplot(covsm, aes(x = index, y = value, color = color)) +
            facet_wrap(~variable, nrow = 3) +
            geom_point(size = 1, alpha = 0.5) +
            labs(x = 'Chromosome', y = "Covariance", title = paste0(pop, " ", winsize, " window")) +
            scale_color_manual(values = c("gray70", "steelblue"), guide = "none") +
            ylim(ymin, ymax) +
            theme_classic() +
            theme(axis.text.x = element_blank())
        #ggsave(paste0("../Output/COV/3Pops.",pop,"_tempCovs_acrossGenome_",winsize, "Window.png"), width = 8, height = 8, dpi=300)
    } else {
        # third population (SS): only the cov23 file is read
        cov23 <- read.csv(paste0(md_dir, pop, "_cov23_2017-2006_2006-1996_3Pops_", winsize, "window.csv"), header = F)
        covs <- cbind(iv, cov23)
        colnames(covs)[4] <- c("cov23")
        covs$index <- 1:nrow(covs)
        covs$color <- ifelse(covs$chrom %in% evens, "col2", "col1")

        covs[sapply(covs, is.infinite)] <- NA
        covs[sapply(covs, is.nan)] <- NA

        cov.list[[k]] <- covs
        names(cov.list)[k] <- paste0(pop, "_", winsize)
        k <- k + 1

        y <- min(covs$cov23, na.rm = T)
        ymin <- max(y, -0.1)
        ymax <- max(covs$cov23, na.rm = T)
        ggplot(covs, aes(x = index, y = cov23, color = color)) +
            geom_point(size = 1, alpha = 0.5) +
            labs(x = 'Chromosome', y = "Covariance", title = paste0(pop, " ", winsize, " window")) +
            scale_color_manual(values = c("gray70", "steelblue"), guide = "none") +
            ylim(ymin, ymax) +
            theme_classic() +
            theme(axis.text.x = element_blank())
        #ggsave(paste0("../Output/COV/3Pops.",pop,"_tempCovs_acrossGenome_",winsize[i], "Window.png"), width = 8, height = 2.7, dpi=300)
    }
}
```

![](../Output/COV/3Pops.PWS_tempCovs_acrossGenome_100kWindow.png){width=65%}

![](../Output/COV/3Pops.TB_tempCovs_acrossGenome_100kWindow.png){width=65%}   

![](../Output/COV/3Pops.SS_tempCovs_acrossGenome_100kWindow.png){width=65%}  


# Find the covariance lower cut off values  
```{r eval=FALSE, message=FALSE, warning=FALSE}

# For every (population, interval pair) combination, record the range of
# covariances spanned by the top 1% of windows as a "low-high" string in
# cvrange[, "100k"]. cov12 and cov13 exist for PWS and TB only; cov23 exists
# for all three populations.
# The original chunk ended with an unmatched closing brace and could not be
# parsed; the three copy-pasted per-population sections are now one loop.
cv <- c("cov12", "cov13", "cov23")
cvrange <- data.frame(pop = c(paste0(pops[1:2], "_", cv[1]), paste0(pops[1:2], "_", cv[2]), paste0(pops, "_", cv[3])))

# number of windows in the top 1%, taken from the PWS table and applied to
# every population (as in the original code)
n <- ceiling(nrow(cov.list[["PWS_100k"]]) * 0.01)

for (i in seq_along(cv)) {
    # first cvrange row for this interval pair: cov12 -> 1, cov13 -> 3, cov23 -> 5
    k <- c(1, 3, 5)[i]
    pops_i <- if (i == 3) c("PWS", "TB", "SS") else c("PWS", "TB")

    for (j in seq_along(pops_i)) {
        tbl <- cov.list[[paste0(pops_i[j], "_100k")]]
        # largest values first, missing values last
        vals <- sort(tbl[, cv[i]], decreasing = TRUE, na.last = TRUE)
        rg <- range(vals[seq_len(n)], na.rm = TRUE)
        cvrange[k + j - 1, "100k"] <- paste0(rg[1], "-", rg[2])
    }
}

# Reshape the "low-high" strings into numeric columns and split the
# "<pop>_<cov>" key into separate `pop` and `cov` columns.
cvs <- melt(cvrange, id.vars = "pop") %>%
    separate(value, c("low", "high"), "-") %>%
    separate(pop, c("pop", "cov"), "_")
cvs$low <- as.numeric(cvs$low)
cvs$high <- as.numeric(cvs$high)

# Range of the top 1% covariances, per interval pair and population
ggplot(cvs, aes(x = cov, y = high, fill = pop)) +
    geom_crossbar(aes(ymin = low, ymax = high), width = 0.5,
                  position = position_dodge(width = 1)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray") +
    labs(x = "", y = "Range of covariances", title = "Top1% Cov Range") +
    theme_light() +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          legend.title = element_blank())
ggsave("../Output/COV/COVscan_3pop/TempCov_Range_comparison_100k.png", width = 5, height = 3, dpi = 300)

# Lower bound of the top 1% only
ggplot(cvs, aes(x = cov, y = low, color = pop)) +
    geom_point() +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray") +
    labs(x = "", y = "Lower limit of top 1% covariance") +
    theme_light() +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          legend.title = element_blank())
ggsave("../Output/COV/COVscan_3pop/TempCov_Range_lowLimit_comparison_100k.png", width = 5, height = 3, dpi = 300)

```

![](../Output/COV/COVscan_3pop/TempCov_Range_comparison_100k.png)


![](../Output/COV/COVscan_3pop/TempCov_Range_lowLimit_comparison_100k.png)

## Use the lowest covariance values for each period to define outlier regions 

```{r eval=FALSE, message=FALSE, warning=FALSE}

# Per interval pair, the smallest "top 1%" lower bound across populations is
# used as the common outlier cutoff.
lows <- aggregate(cvs$low, by = list(cvs$cov), min)
names(lows) <- c("cov", "low")
#low cutoff for each time period (100k-window)
#     cov        low
#1 cov12 0.02874841
#2 cov13 0.03102712
#3 cov23 0.03246524

# Windows of `tbl` whose `period` covariance reaches the cutoff, ordered by
# chromosome and start position and tagged with the population name.
pick_outliers <- function(tbl, period, pop, presort = TRUE) {
    if (presort) {
        tbl <- tbl[order(tbl[[period]], decreasing = T), ]
    }
    cutoff <- lows$low[lows$cov == period]
    hits <- tbl[which(tbl[[period]] >= cutoff), ]   # NA covariances are dropped
    hits <- hits[order(hits$chrom, hits$start), ]
    hits$pop <- pop
    hits
}

# Outliers based on the new low cut-off values 100k window.
cov12 <- data.frame()
cov23 <- data.frame()
cov13 <- data.frame()

for (i in 1:length(cov.list)) {
    tbl_name <- names(cov.list)[i]
    pop <- gsub("_.+", '', tbl_name)

    # PWS and TB (first two entries of cov.list): all three interval pairs
    if (i == 1 | i == 2) {
        covs <- cov.list[[i]]
        cov12 <- rbind(cov12, pick_outliers(covs, "cov12", pop, presort = FALSE))
        cov13 <- rbind(cov13, pick_outliers(covs, "cov13", pop))
        # keep the same columns as the SS table so the rows can be stacked
        cov23 <- rbind(cov23, pick_outliers(covs[, c("chrom", "start", "end", "cov23", "index", "color")], "cov23", pop))
    }
    # SS: cov23 only
    if (grepl("SS", tbl_name)) {
        covs <- cov.list[[i]]
        cov23 <- rbind(cov23, pick_outliers(covs, "cov23", pop))
    }
}

write.csv(cov12, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov12.csv", row.names = F)
write.csv(cov23, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov23.csv", row.names = F)
write.csv(cov13, "../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.cov13.csv", row.names = F)
```


### Stricter covariance cutoff 
```{r eval=FALSE, message=FALSE, warning=FALSE}
#
# Rebuild the outlier tables and, for PWS, write a BED file of the cov12
# windows above a stricter cutoff (0.035), padded by 100 kb on each side.
cov12 <- data.frame()
cov23 <- data.frame()
cov13 <- data.frame()
names(cov.list)
for (i in seq_along(cov.list)) {
    #PWS and TB
    if (i == 1 | i == 2) {
        covs <- cov.list[[i]]
        pop <- gsub("_.+", '', names(cov.list)[i])

        # quick visual check of the cov12 distribution
        plot(covs$cov12)

        #outlier cutoff value
        x <- lows$low[lows$cov == "cov12"]
        covs12_top <- subset(covs, cov12 >= x)
        # cov cutoff at 0.035
        c12 <- covs12_top[covs12_top$cov12 > 0.035, ]

        # Create a bed file for the region (PWS only).
        # The original filtered on `df$pop`, a column that does not exist in
        # this table, so the file was always written empty, and it was also
        # written on the TB iteration under a PWS file name.
        if (pop == "PWS") {
            df <- c12[, c("chrom", "start", "end")]
            #add 100k, keeping the start coordinate non-negative
            df$start <- pmax(df$start - 100000, 0)
            df$end <- df$end + 100000
            # header line; vcftools expects one in --bed files
            colnames(df) <- c('track type=bedGraph', '1', '1')
            write.table(df, paste0("../Output/COV/COVscan_3pop/cutoff/PWS_outliers_", "cov12", "_new.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
        }

        covs12_top <- covs12_top[order(covs12_top$chrom, covs12_top$start), ]
        covs12_top$pop <- pop
        cov12 <- rbind(cov12, covs12_top)

        covs <- covs[order(covs$cov13, decreasing = TRUE), ]
        x <- lows$low[lows$cov == "cov13"]
        covs13_top <- subset(covs, cov13 >= x)
        covs13_top <- covs13_top[order(covs13_top$chrom, covs13_top$start), ]
        covs13_top$pop <- pop
        cov13 <- rbind(cov13, covs13_top)

        covs <- covs[order(covs$cov23, decreasing = TRUE), ]
        x <- lows$low[lows$cov == "cov23"]
        covs23_top <- subset(covs[, c("chrom", "start", "end", "cov23", "index", "color")], cov23 >= x)
        covs23_top <- covs23_top[order(covs23_top$chrom, covs23_top$start), ]
        covs23_top$pop <- pop
        cov23 <- rbind(cov23, covs23_top)
    }
    # SS: cov23 only
    if (grepl("SS", names(cov.list)[i])) {
        covs <- cov.list[[i]]

        pop <- gsub("_.+", '', names(cov.list)[i])
        win <- gsub(paste0(pop, "_"), '', names(cov.list)[i])

        covs <- covs[order(covs$cov23, decreasing = TRUE), ]
        x <- lows$low[lows$cov == "cov23"]
        covs23_top <- subset(covs, cov23 >= x)
        covs23_top <- covs23_top[order(covs23_top$chrom, covs23_top$start), ]
        covs23_top$pop <- pop
        cov23 <- rbind(cov23, covs23_top)
    }
}
```


## Create plots with different colors for outliers

```{r eval=FALSE, message=FALSE, warning=FALSE}
# Outlier plots for each interval pair: one figure faceted by chromosome and
# one genome-wide figure, with outlier windows coloured by population.
# cov12 / cov13 are plotted for PWS and TB; cov23 for PWS, TB and SS.
# Changes from the original: columns are referenced by their full names
# (`chrom`, `cov23`) instead of relying on `$` partial matching, and the
# row-wise apply() over a data.frame is replaced by a vectorised ifelse().
#for COV12 and COV13 for TB and PWS (100K)
cv<-c("cov12","cov13","cov23")

for (i in seq_along(cv)){
    if (i==1|i==2){
        #cutoff value
        x<-lows$low[lows$cov==cv[i]]
        #PWS
        df1<-cov.list[["PWS_100k"]]
        df1<-df1[order(df1[,cv[i]], decreasing=TRUE),]
        df1$top1<-"N"
        df1$top1[df1[,cv[i]]>=x]<-"PWS"
        
        #tb
        df2<-cov.list[["TB_100k"]]
        df2<-df2[order(df2[,cv[i]], decreasing=TRUE),]
        df2$top1<-"N"
        df2$top1[df2[,cv[i]]>=x]<-"TB"
        
        #Combine PWS and TB tables
        co<-rbind(df1, df2)
        co$chrom<-factor(co$chrom, levels=paste0("chr", 1:26))
        co$top1<-factor(co$top1, levels=c("PWS","TB","N"))
        # the interval pair being plotted is exposed under the generic name "cov"
        colnames(co)[which(colnames(co)==cv[i])]<-"cov"
    
        ymax<-max(co$cov, na.rm=TRUE)
        #Plot each genome separately
        # NOTE(review): guide_legend(title=) is documented to take a string;
        # confirm that passing element_text(...) here is intended.
        ggplot(co, aes(x=start/1000000, y=cov, color=top1))+
            geom_point(size=0.5)+
            facet_wrap(~chrom, ncol=4)+
            theme_classic()+ylim(-0.1,ymax)+
            scale_color_manual(values=c(paste0(cols[2],"B3"),paste0(cols[1],"B3") ,"#C0C0C080"), labels=c("PWS", "TB", ""))+
            ylab("Covariance")+xlab('Postion (Mb)')+
            ggtitle(cv[i])+
            scale_x_continuous(labels = comma)+
            guides(color = guide_legend(override.aes = list(color=c(cols[2],cols[1],"white"),size=2), title=element_text("Top 1%")))
   
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.",cv[i],"_perChrom_100k_Window_Outliers.png"), width = 10, height = 8, dpi=300)
        
        #Whole genome in 1 plot 
        #assign colors: non-outliers take the alternating chromosome colour
        co$top1<-ifelse(co$top1=="N", co$color, as.character(co$top1))
        co$top1<-factor(co$top1, levels=c("PWS","TB","col1","col2"))
        
        #count the number of sites per chromosomes to place the axis labels
        poss<-data.frame(chr=paste0("chr",1:26))
        n_win<-as.integer(table(factor(df1$chrom, levels=poss$chr)))
        poss$start<-cumsum(n_win)-n_win+1
        poss$end<-cumsum(n_win)
        poss$x<-poss$start+(poss$end-poss$start)/2
        ymax<-max(co$cov, na.rm=TRUE)
        ggplot(co, aes(x=index, y=cov, color=top1))+
            geom_point(size=0.5)+
            theme_classic()+ylim(-0.1,ymax)+
            scale_color_manual(values=c(paste0(cols[2],"B3"),paste0(cols[1],"B3"),"#A8BBCD66","#D6D6D666"), labels=c("PWS", "TB", "",""))+
            ylab("Covariance")+
            ggtitle(paste0(" 100k window ",cv[i]))+
            guides(color = guide_legend(override.aes = list(color=c(cols[2], cols[1],"white","white"), size=2), title=element_text("Outlier Region", size=10)))+
            scale_x_continuous(name="Chromosome", breaks=poss$x, labels=1:26)
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.",cv[i],"_100k_Window_Outliers.png"), width = 10, height = 3.5, dpi=300)
    }
   
    if (i==3){
       #cutoff value
        x<-lows$low[lows$cov==cv[i]]
        #PWS
        df1<-cov.list[["PWS_100k"]]
        df1<-df1[,c("chrom","start","end","cov23","index","color")]
        df1<-df1[order(df1$cov23, decreasing=TRUE),]
        df1$top1<-"N"
        df1$top1[df1[,cv[i]]>=x]<-"PWS"
        
        #tb
        df2<-cov.list[["TB_100k"]]
        df2<-df2[,c("chrom","start","end","cov23","index","color")]
        df2<-df2[order(df2$cov23, decreasing=TRUE),]
        df2$top1<-"N"
        df2$top1[df2[,cv[i]]>=x]<-"TB"
    
        #ss
        df3<-cov.list[["SS_100k"]]
        df3<-df3[,c("chrom","start","end","cov23","index","color")]
        df3<-df3[order(df3$cov23, decreasing=TRUE),]
        df3$top1<-"N"
        df3$top1[df3[,cv[i]]>=x]<-"SS"

        co<-rbind(df1,df2,df3)

        co$chrom<-factor(co$chrom, levels=paste0("chr", 1:26))
        co$top1<-factor(co$top1, levels=c("PWS","TB","SS","N"))
        ymax<-max(co$cov23, na.rm=TRUE)
        ggplot(co, aes(x=start/1000000, y=cov23, color=top1))+
            geom_point(size=0.6)+
            facet_wrap(~chrom, ncol=4)+
            theme_classic()+ylim(-0.1,ymax)+
            ylab("Covariance")+xlab('Postion (Mb)')+
            ggtitle(cv[i])+
            scale_x_continuous(labels = comma)+
            #scale_color_discrete(breaks=c("PWS","SS","TB"))+
            scale_color_manual(values=c(paste0(cols[c(2,1,3)],"B3"),"#C0C0C088"), labels=c("PWS","TB","SS", ""))+
            guides(color = guide_legend(override.aes = list(color=c(cols[c(2,1,3)],"white"), size=2),title=element_text("Top 1% outliers"))) 
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.cov23_perChrom_100k_Window_Outliers.png"), width = 10, height = 9, dpi=300)
        
        #assign colors: non-outliers take the alternating chromosome colour
        co$top1<-ifelse(co$top1=="N", co$color, as.character(co$top1))
        co$top1<-factor(co$top1, levels=c("PWS","TB","SS","col1","col2"))
        #count the number of sites per chromosomes to place the axis labels
        poss<-data.frame(chr=paste0("chr",1:26))
        n_win<-as.integer(table(factor(df1$chrom, levels=poss$chr)))
        poss$start<-cumsum(n_win)-n_win+1
        poss$end<-cumsum(n_win)
        poss$x<-poss$start+(poss$end-poss$start)/2
        # the original read `co$cov`, which resolved to cov23 only by partial matching
        ymax<-max(co$cov23, na.rm=TRUE)
        ggplot(co, aes(x=index, y=cov23, color=top1))+
            geom_point(size=0.5)+
            theme_classic()+ylim(-0.1,ymax)+
            scale_color_manual(values=c(paste0(cols[c(2,1,3)],"B3"),"#A8BBCD66","#D6D6D666"), labels=c("PWS", "TB","SS", "",""))+
                ylab("Covariance")+
                ggtitle(paste0(" 100k window ",cv[i]))+
                guides(color = guide_legend(override.aes = list(color=c(cols[c(2,1,3)],"white","white"), size=2), title=element_text("Outlier (1%)")))+
            scale_x_continuous(name="Chromosome", breaks=poss$x, labels=1:26)+
            theme(legend.title = element_text(size=10))
        ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/3Pops.",cv[i],"_100k_Window_Outliers.png"), width = 10, height = 3.5, dpi=300)
        }
        
}

```

![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov12_perChrom_100k_Window_Outliers.png)
![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov13_perChrom_100k_Window_Outliers.png)


![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov23_perChrom_100k_Window_Outliers.png)  

![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov12_100k_Window_Outliers.png)

![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov13_100k_Window_Outliers.png)
![](../Output/COV/COVscan_3pop/cutoff/3Pops.cov23_100k_Window_Outliers.png) 

### Whole genome plots all time periods for PWS and TB

```{r eval=FALSE, message=FALSE, warning=FALSE}
## Plot 3 time periods together for PWS and TB
# One genome-wide panel per interval pair; outlier windows are coloured by
# population, all other windows by the alternating chromosome colour.
# Changes from the original: two unmatched closing braces at the end of the
# chunk are removed (they made it unparseable), the upper y limit is taken
# from all plotted values (`Cov`) instead of the last interval pair only, and
# `chrom` is referenced by its full name.
Cov<-data.frame()
for (i in seq_along(cv)){
    #cutoff value
    x<-lows$low[lows$cov==cv[i]]
    #PWS
    df1<-cov.list[["PWS_100k"]]
    df1<-df1[order(df1[,cv[i]], decreasing=TRUE),]
    df1$top1<-"N"
    df1$top1[df1[,cv[i]]>=x]<-"PWS"
    
    #tb
    df2<-cov.list[["TB_100k"]]
    df2<-df2[order(df2[,cv[i]], decreasing=TRUE),]
    df2$top1<-"N"
    df2$top1[df2[,cv[i]]>=x]<-"TB"
    
    #Combine PWS and TB tables
    co<-rbind(df1, df2)
    co$chrom<-factor(co$chrom, levels=paste0("chr", 1:26))
    # the interval pair being plotted is exposed under the generic name "cov"
    colnames(co)[which(colnames(co)==cv[i])]<-"cov"
    #assign colors: non-outliers take the alternating chromosome colour
    co$top1<-ifelse(co$top1=="N", co$color, co$top1)
    co$top1<-factor(co$top1, levels=c("PWS","TB","col1","col2"))
    co$time<-cv[i]
    
    Cov<-rbind(Cov, co[,c("index", "cov","top1","time")])
}

#count the number of sites per chromosomes to place the axis labels
df1<-cov.list[["PWS_100k"]]
poss<-data.frame(chr=paste0("chr",1:26))
n_win<-as.integer(table(factor(df1$chrom, levels=poss$chr)))
poss$start<-cumsum(n_win)-n_win+1
poss$end<-cumsum(n_win)
poss$x<-poss$start+(poss$end-poss$start)/2
# upper limit over every panel, so no cov12/cov13 point is cut off by ylim()
ymax<-max(Cov$cov, na.rm=TRUE)
ggplot(Cov, aes(x=index, y=cov, color=top1))+
    facet_wrap(~time, ncol=1)+
    geom_point(size=0.5)+
    theme_classic()+ylim(-0.1,ymax)+
    scale_color_manual(values=c(paste0(cols[c(2,1)],"B3"),"#A8BBCD66","#D6D6D666"), labels=c("PWS", "TB", "",""))+
    ylab("Covariance")+
    guides(color = guide_legend(override.aes = list(color=c(cols[c(2,1)],"white","white"), size=2), title=element_text("Outlier", size=10)))+
    scale_x_continuous(name="Chromosome", breaks=poss$x, labels=1:26)

ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/PWS_TB_100k_Window_Outliers.png"), width = 11, height = 5, dpi=300)


```

![](../Output/COV/COVscan_3pop/cutoff/PWS_TB_100k_Window_Outliers.png)




## Overlapping outlier regions between different populations 
```{r eval=FALSE, message=FALSE, warning=FALSE}
#100k
# Count outlier windows shared between populations, both as exact matches
# (same chrom and start) and as windows whose starts lie within 200 kb of
# each other, and write the nearby pairs to CSV.
# Changes from the original:
#  * the cov.<pop> columns were swapped (values of the second population were
#    written under the first population's name, and vice versa);
#  * covariances are taken from the column named after the interval pair
#    instead of column 4, which holds cov12 in the cov13 table;
#  * populations without any nearby window no longer break the loop.
cv <- c("cov12", "cov13", "cov23")
pairs <- t(combn(pops, 2))
pairs <- data.frame(pairs)
colnames(pairs) <- paste0("pop", 1:2)
ov_rows <- c(cv[1:2], "cov23-PT", "cov23-PS", "cov23-ST", "cov23-3")
Ov_direct <- data.frame(cov = ov_rows, count = NA_integer_)
Ov_300 <- data.frame(cov = ov_rows, count = NA_integer_)

# Pairs of windows (one from pop1, one from pop2) on the same chromosome whose
# start positions differ by at most `pad` bases. Returns one row per pair with
# the id of the pop2 window, the merged start/end and both covariances.
padded_overlaps <- function(pop1, pop2, value_col, name1, name2, pad = 200000) {
    idx1 <- integer(0)
    idx2 <- integer(0)
    for (n in seq_len(nrow(pop1))) {
        hit <- which(pop2$chrom == pop1$chrom[n] &
                         abs(pop2$start - pop1$start[n]) <= pad)
        idx1 <- c(idx1, rep(n, length(hit)))
        idx2 <- c(idx2, hit)
    }
    a <- pop1[idx1, ]
    b <- pop2[idx2, ]
    ov <- data.frame(id = b$id,
                     start = pmin(a$start, b$start),
                     # end of whichever window starts later
                     end = ifelse(b$start < a$start, a$end, b$end),
                     cov1 = a[[value_col]],
                     cov2 = b[[value_col]])
    names(ov)[4:5] <- paste0("cov.", c(name1, name2))
    ov
}

for (i in seq_along(cv)) {
    df <- read.csv(paste0("../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    df$id <- paste0(df$chrom, "_", df$start)

    if (i != 3) {
        #exact overlaps
        isec <- intersect(df$id[df$pop == "PWS"], df$id[df$pop == "TB"])
        Ov_direct$count[i] <- length(isec)

        #### Check chromosome region overlap +-200,000 bases
        ov <- padded_overlaps(df[df$pop == "PWS", ], df[df$pop == "TB", ], cv[i], "PWS", "TB")
        # file name kept as-is ("plusminus100k") although the window is +-200 kb
        write.csv(ov, paste0("../Output/COV/COVscan_3pop/cutoff/Overlap_regions_", cv[i], "_plusminus100k.csv"), row.names = FALSE)
        Ov_300$count[i] <- nrow(ov)
    }

    if (i == 3) {
        isec <- intersect(df$id[df$pop == "PWS"], df$id[df$pop == "TB"])
        isec2 <- intersect(df$id[df$pop == "PWS"], df$id[df$pop == "SS"])
        isec3 <- intersect(df$id[df$pop == "SS"], df$id[df$pop == "TB"])
        Ov_direct$count[i] <- length(isec)
        Ov_direct$count[i + 1] <- length(isec2)
        Ov_direct$count[i + 2] <- length(isec3)
        Ov_direct$count[i + 3] <- length(intersect(df$id[df$pop == "SS"], isec))

        for (j in seq_len(nrow(pairs))) {
            #### Check chromosome region overlap +-200,000 bases
            ov <- padded_overlaps(df[df$pop == pairs[j, 1], ], df[df$pop == pairs[j, 2], ],
                                  cv[i], pairs[j, 1], pairs[j, 2])
            ov <- ov[!duplicated(ov), ]
            write.csv(ov, paste0("../Output/COV/COVscan_3pop/cutoff/Overlap_regions_", cv[i], "_", pairs[j, 1], ".", pairs[j, 2], "_plusminus200k.csv"), row.names = FALSE)
            Ov_300$count[i + j - 1] <- nrow(ov)
        }
    }
}
write.csv(Ov_direct, paste0("../Output/COV/COVscan_3pop/cutoff/Direct_Overlapping_regions_counts_3pop_summary.csv"))
# no three-way count is computed for the padded comparison
Ov_300$count[6] <- NA
write.csv(Ov_300, paste0("../Output/COV/COVscan_3pop/cutoff/Overlapping_regions_counts_3pop_plusMinus200k.csv"))

```


# Run the snpEff pipeline to find annotation in the outlier regions (100k-window+-100k)  

## Create a script to run SnpEff 

Create VCF files with selected regions & run snpEff  
```{r eval=FALSE, message=FALSE, warning=FALSE}
#Create bed files
# One BED file per population and interval pair with the outlier windows
# padded by 100 kb on each side, plus a bash script that runs vcftools on
# every BED file.
cv <- c("cov12", "cov13", "cov23")
#Prevent scientific notation in bed files
options(scipen = 999)

#The first line of bed files is often not read by vcftools, so every file gets
#a header line. The original wrote the header for PWS only, which makes
#vcftools drop the first TB / SS region.
for (i in seq_along(cv)) {
    df <- read.csv(paste0("../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    #add 100k, keeping the start coordinate non-negative
    df$start <- pmax(df$start - 100000, 0)
    df$end <- df$end + 100000

    # SS has outliers for cov23 only
    bed_pops <- if (i == 3) c("PWS", "TB", "SS") else c("PWS", "TB")
    for (bp in bed_pops) {
        bed <- df[df$pop == bp, 1:3]
        colnames(bed) <- c('track type=bedGraph', '1', '1')
        write.table(bed, paste0("../Output/COV/COVscan_3pop/cutoff/", bp, "_outliers_", cv[i], "_new.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
    }
}

# Create a bash script to create vcf files with selected regions
bedfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "_new\\.bed$")

vcf_cmds <- vapply(bedfiles, function(bedfile) {
    fname <- sub("\\.bed$", '', bedfile)
    paste0("vcftools --gzvcf Data/new_vcf/3pop/3pops.MD7000_NS0.5_maf05.vcf.gz --bed Output/COV/COVscan_3pop/cutoff/", bedfile, " --out Output/COV/COVscan_3pop/cutoff/", fname, " --recode --keep-INFO-all \n")
}, character(1), USE.NAMES = FALSE)
# written in one call so no output diversion is left open if something fails
cat("#!/bin/bash \n\n", vcf_cmds, file = "../COVscan_createVCFs_3Pops_cutoff.sh", sep = "")
```


### Create bed files and scripts to run SnpEff with less padded windows (10k)

Create VCF files with selected regions & run snpEff  
```{r eval=FALSE, message=FALSE, warning=FALSE}
#Create bed files
# Same as the 100 kb version but with the outlier windows padded by only
# 10 kb on each side; files carry the "_10kpad" suffix.
cv <- c("cov12", "cov13", "cov23")
#Prevent scientific notation in bed files
options(scipen = 999)

#The first line of bed files is often not read by vcftools, so every file gets
#a header line. The original wrote the header for PWS only, which makes
#vcftools drop the first TB / SS region.
for (i in seq_along(cv)) {
    df <- read.csv(paste0("../Output/COV/COVscan_3pop/cutoff/3pops_top1percent_outlier_cutoff.", cv[i], ".csv"))
    #add 10k, keeping the start coordinate non-negative
    df$start <- pmax(df$start - 10000, 0)
    df$end <- df$end + 10000

    # SS has outliers for cov23 only
    bed_pops <- if (i == 3) c("PWS", "TB", "SS") else c("PWS", "TB")
    for (bp in bed_pops) {
        bed <- df[df$pop == bp, 1:3]
        colnames(bed) <- c('track type=bedGraph', '1', '1')
        write.table(bed, paste0("../Output/COV/COVscan_3pop/cutoff/", bp, "_outliers_", cv[i], "_10kpad.bed"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
    }
}

# Create a bash script to create vcf files with selected regions
bedfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "_10kpad\\.bed$")

vcf_cmds <- vapply(bedfiles, function(bedfile) {
    fname <- sub("\\.bed$", '', bedfile)
    paste0("vcftools --gzvcf Data/new_vcf/3pop/3pops.MD7000_NS0.5_maf05.vcf.gz --bed Output/COV/COVscan_3pop/cutoff/", bedfile, " --out Output/COV/COVscan_3pop/cutoff/", fname, " --recode --keep-INFO-all \n")
}, character(1), USE.NAMES = FALSE)
# written in one call so no output diversion is left open if something fails
cat("#!/bin/bash \n\n", vcf_cmds, file = "../COVscan_createVCFs_3Pops_cutoff_10kpad.sh", sep = "")
```


```{bash eval=FALSE, include=FALSE}
# Run the vcftools script from the project root.
# NOTE(review): the R chunks in this notebook write
# ../COVscan_createVCFs_3Pops_cutoff.sh and
# ../COVscan_createVCFs_3Pops_cutoff_10kpad.sh; confirm that
# COVscan_createVCFs_3pops.sh is the script intended here.
cd ~/Projects/PacHerring
bash COVscan_createVCFs_3pops.sh
```


```{r eval=FALSE, message=FALSE, warning=FALSE}
#create a bash script to run snpEff
# Two scripts are written into the snpEff directory: one for the VCFs made
# from the 100 kb padded ("_new") BED files and one for the 10 kb padded
# ("_10kpad") ones. For every VCF the script annotates it with snpEff and then
# extracts CHROM, POS, AF and ANN with bcftools into "<name>_annotation".

# Write a snpEff + bcftools script for `vfiles`; `fnames` are the matching
# output base names.
write_snpeff_script <- function(vfiles, fnames, script_path) {
    cutoff_dir <- "~/Projects/PacHerring/Output/COV/COVscan_3pop/cutoff/"
    cmds <- vapply(seq_along(vfiles), function(i) {
        paste0(
            "java -Xmx8g -jar snpEff.jar Ch_v2.0.2.99 ", cutoff_dir, vfiles[i], " -stats ", cutoff_dir, fnames[i], ".html >  ", cutoff_dir, "Anno.", fnames[i], ".vcf \n",
            #extract the annotation information
            "bcftools query -f '%CHROM %POS %INFO/AF %INFO/ANN\\n' ", cutoff_dir, "Anno.", fnames[i], ".vcf > ", cutoff_dir, fnames[i], "_annotation \n\n"
        )
    }, character(1))
    # single write; no output diversion is left open if something fails
    cat("#!/bin/bash \n\n", cmds, file = path.expand(script_path), sep = "")
}

# 100 kb padded VCFs only. The original pattern ".recode.vcf" also matched the
# "_10kpad" VCFs, whose names were then left unstripped and produced
# "Anno.<name>.recode.vcf.vcf" outputs.
vfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "_new\\.recode\\.vcf$")
write_snpeff_script(vfiles, sub("_new\\.recode\\.vcf$", "", vfiles),
                    "~/programs/snpEff/runsnpEff_cov_3pop_cutoff.sh")


## for 10k pad vcf

#create a bash script to run snpEff
vfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "10kpad\\.recode\\.vcf$")
write_snpeff_script(vfiles, sub("\\.recode\\.vcf$", "", vfiles),
                    "~/programs/snpEff/runsnpEff_cov_3pop_cutoff_10kpad.sh")



```

```{bash eval=FALSE, include=FALSE}
# Run the generated snpEff script from the snpEff install directory
# (the script calls snpEff.jar by relative path).
# NOTE(review): only the 100 kb script is run here; the
# runsnpEff_cov_3pop_cutoff_10kpad.sh script written by the R chunk above is
# not invoked in this chunk.
cd ~/programs/snpEff
bash runsnpEff_cov_3pop_cutoff.sh
```



## Create summary gene files from snpEff and check overlapping genes.

```{r eval=FALSE, message=FALSE, warning=FALSE}
## Create summary files of snpEff results (gene annotations in the regions of interest) and reformat as a ShinyGo input 

#create gene list 
# For every snpEff "*.genes.txt" table, write the unique gene IDs as a single
# "; "-separated line to geneIDlist_<name>.txt.
gfiles <- list.files("../Output/COV/COVscan_3pop/cutoff/", pattern = "genes\\.txt$")

# seq_along() so that an empty directory listing is simply skipped
for (i in seq_along(gfiles)) {
    df <- read.table(paste0("../Output/COV/COVscan_3pop/cutoff/", gfiles[i]), sep = "\t")
    # only the first seven columns are used
    df <- df[, 1:7]
    colnames(df) <- c("GeneName", "GeneId", "TranscriptId", "BioType", "variants_impact_HIGH", "variants_impact_LOW", "variants_impact_MODERATE")

    fname <- sub("\\.genes\\.txt$", "", gfiles[i])
    genes <- unique(df$GeneId)
    # written directly to the file instead of diverting console output with sink()
    cat(paste0(genes, "; "), file = paste0("../Output/COV/COVscan_3pop/cutoff/geneIDlist_", fname, ".txt"))
}

# Annotation info from snpEff: turn each "<pop>_outliers_<cov>_annotation"
# table (written by `bcftools query -f '%CHROM %POS %INFO/AF %INFO/ANN'`) into
# a per-SNP gene table.

# Expand the ANN column (V4) of one annotation table into one row per distinct
# annotation, keep the Effect / impact / gene name / gene ID / feature-type
# sub-fields, attach chr (V1), pos (V2) and AF (V3), and keep the first row
# per chr/pos. Each annotation is laid out as 16 sub-fields (ncol = 16).
parse_snpeff_annotation <- function(ano) {
    per_snp <- lapply(seq_len(nrow(ano)), function(i) {
        anns <- unlist(strsplit(ano$V4[i], "\\,|\\|"))
        annm <- data.frame(matrix(anns, ncol = 16, byrow = TRUE))
        annm <- annm[, c(2, 3, 4, 5, 8)]
        colnames(annm) <- c("Effect", "Putative_impact", "Gene_name", "Gene_ID", "Feature type")
        annm <- annm[!duplicated(annm), ]
        annm$chr <- ano$V1[i]
        annm$pos <- ano$V2[i]
        annm$AF <- ano$V3[i]
        annm
    })
    # bind once instead of growing a data frame inside the loop
    annotations <- do.call(rbind, per_snp)
    annotations <- annotations[, c(6:8, 1:5)]
    annotations[!duplicated(annotations[, 1:2]), ]
}

cv <- c("cov12", "cov13", "cov23")
for (period in seq_along(cv)) {
    # cov12 and cov13 are processed for the first two populations only;
    # cov23 is processed for the first three.
    n_pops <- if (period == 3) 3 else 2
    for (p in seq_len(n_pops)) {
        ano <- read.table(paste0("../Output/COV/COVscan_3pop/cutoff/", pops[p], "_outliers_", cv[period], "_annotation"),
                          header = FALSE)
        annotations <- parse_snpeff_annotation(ano)
        write.csv(annotations,
                  paste0("../Output/COV/COVscan_3pop/cutoff/Genes_", pops[p], "_outliers_100k_", cv[period], ".csv"),
                  row.names = FALSE)
    }
}

```


### for 10k pad data

```{r eval=FALSE, message=FALSE, warning=FALSE}
## Create summary files of snpEff results (gene annotations in the regions of interest) and reformat as a ShinyGo input 

# Create gene lists for the 10k-pad runs: for every "10kpad.genes.txt" snpEff
# summary, write the unique gene IDs as a "; "-separated text file.
cutoff_dir <- "../Output/COV/COVscan_3pop/cutoff/"
gfiles <- list.files(cutoff_dir, pattern = "10kpad.genes.txt")

# Iterating over the file names directly does nothing when no file matches
# (1:length() would have tried to read an "NA" file).
for (gfile in gfiles) {
    df <- read.table(paste0(cutoff_dir, gfile), sep = "\t")
    df <- df[, 1:7]
    colnames(df) <- c("GeneName", "GeneId", "TranscriptId", "BioType",
                      "variants_impact_HIGH", "variants_impact_LOW", "variants_impact_MODERATE")

    # strip the literal suffix (fixed = TRUE: the dots are not regex wildcards)
    fname <- gsub(".genes.txt", "", gfile, fixed = TRUE)
    genes <- unique(df$GeneId)
    # cat(file = ) writes the same bytes as sink()/cat()/sink(NULL) without
    # diverting console output.
    cat(paste0(genes, "; "), file = paste0(cutoff_dir, "geneIDlist_", fname, ".txt"))
}


### no enrichment found from 10k pad (PWS-cov12)
```






## Find the overlapping gene names 

```{r eval=FALSE, message=FALSE, warning=FALSE}

# For every per-SNP gene table, replace intergenic annotations ("geneA-geneB")
# by their two flanking genes and write the resulting gene list. Rows whose
# name cannot be split unambiguously are written to an "Oddnames_" file for
# manual curation.
cutoff_dir <- "../Output/COV/COVscan_3pop/cutoff/"
gnamesfiles <- list.files(cutoff_dir, pattern = "Genes_.+outliers_100k.+\\d.csv$")

for (i in seq_along(gnamesfiles)) {
    df <- read.csv(paste0(cutoff_dir, gnamesfiles[i]))
    df <- df[, c(1, 6:7)]            # chr, Gene_name, Gene_ID
    df <- df[!duplicated(df), ]

    fname <- gsub(".csv", "", gnamesfiles[i])
    fname <- gsub("Genes_", "", fname)

    # add gene names for front and back of intergenic regions
    is_intergenic <- grepl("-", df$Gene_ID)
    df2 <- df[is_intergenic, ]
    flank_rows <- list()
    oddnames <- data.frame()
    # seq_len(): no iteration at all when there is no intergenic row
    for (j in seq_len(nrow(df2))) {
        gene_parts <- unlist(strsplit(df2$Gene_name[j], "-"))
        ids <- unlist(strsplit(df2$Gene_ID[j], "-"))

        newnames <- NULL
        if (length(gene_parts) == 2) {
            newnames <- gene_parts
        } else {
            # "si:" gene names contain a hyphen of their own, so a three-part
            # split is re-joined around the single "si:" piece.
            n <- grep("si:", gene_parts)
            if (length(gene_parts) == 3 && length(n) == 1 && n %in% 1:2) {
                newnames <- if (n == 1) {
                    c(paste0(gene_parts[1], "-", gene_parts[2]), gene_parts[3])
                } else {
                    c(gene_parts[1], paste0(gene_parts[2], "-", gene_parts[3]))
                }
            }
        }

        if (is.null(newnames)) {
            # anything else is ambiguous: leave it for manual handling
            oddnames <- rbind(oddnames, df2[j, ])
        } else {
            # Named columns are essential: rbind()-ing bare vectors onto an
            # empty data frame produced auto-generated column names, so the
            # Gene_ID filters below emptied the table and no flanking gene
            # was ever added.
            flank_rows[[length(flank_rows) + 1]] <- data.frame(chr = df2$chr[j],
                                                                Gene_name = newnames,
                                                                Gene_ID = ids[1:2])
        }
    }

    # Drop the intergenic rows with a logical mask (df[-integer(0), ] would
    # have removed every row when nothing is intergenic).
    df <- df[!is_intergenic, ]
    if (length(flank_rows) > 0) {
        df_div <- do.call(rbind, flank_rows)
        df_div <- df_div[!duplicated(df_div), ]
        df_div <- df_div[!df_div$Gene_ID %in% c("CHR_END", "CHR_START"), ]
        df <- rbind(df, df_div)
    }
    df <- df[!duplicated(df), ]

    if (nrow(oddnames) != 0) {
        write.csv(df, paste0(cutoff_dir, fname, "GeneList_withIntergenicGenes.csv"), row.names = FALSE)
        write.csv(oddnames, paste0(cutoff_dir, "Oddnames_", fname, ".csv"))
    }
    if (nrow(oddnames) == 0) {
        write.csv(df, paste0(cutoff_dir, fname, "GeneList_withIntergenicGenes_new.csv"), row.names = FALSE)
    }
}
   

## !! ##
## Manually fix the odd names and add them to the GeneList files (XX.GeneList_withIntergenicGenes.csv).
# Add the gene IDs to the geneIDlist_XX_outliers_covxx.txt files as well - the only one that needs updating is TB cov13.
# (updated file names have "_new" at the end)

# Aggregate all gene names: GeneList holds one table per "<pop>_outliers_100k_<cov>"
# file (named by that prefix); Genes is the de-duplicated union of all tables.
# NOTE(review): the "_new" gene lists are written to cutoff/ by the loop above,
# so they are read from cutoff/ here (this previously listed the parent
# COVscan_3pop/ directory) -- confirm no other copy is intended.
genelist_dir <- "../Output/COV/COVscan_3pop/cutoff/"
gnew <- list.files(genelist_dir, pattern = "GeneList_withIntergenicGenes_new.csv$")
if (length(gnew) == 0) {
    stop("No *GeneList_withIntergenicGenes_new.csv files found in ", genelist_dir)
}

GeneList <- lapply(gnew, function(f) read.csv(paste0(genelist_dir, f)))
names(GeneList) <- gsub("GeneList_withIntergenicGenes_new.csv", "", gnew)

# unname() so rbind() does not prefix the row names with the list names
Genes <- do.call(rbind, unname(lapply(GeneList, function(d) d[!duplicated(d), ])))
Genes <- Genes[!duplicated(Genes), ]


#1. Between populations
# For each covariance period, count the genes per population, intersect the
# gene names between populations, keep the shared sets in `common`, and save
# Venn diagrams. cov12/cov13 compare PWS and TB; cov23 also includes SS.
times <- c("cov12", "cov13", "cov23")
venn_dir <- "../Output/COV/COVscan_3pop/cutoff/"

# Draw a Venn diagram of named gene-name vectors and save it as a square PNG.
# The plot is passed to ggsave() explicitly instead of relying on last_plot().
save_venn <- function(sets, fill, title, file, size) {
    p <- ggvenn(sets, fill_color = fill, stroke_size = 0.5, set_name_size = 4, text_size = 3) +
        ggtitle(title)
    ggsave(file, plot = p, width = size, height = size, dpi = 300)
}

common <- list()
# All summary columns are created up front (NA where a comparison does not
# apply) rather than being grown by recycling inside the loop.
common_summary <- data.frame(time = times,
                             PWS = NA_integer_, TB = NA_integer_, SS = NA_integer_,
                             common_PWS.TB = NA_integer_, common_PWS.SS = NA_integer_,
                             common_SS.TB = NA_integer_, common3 = NA_integer_)

for (i in seq_along(times)) {
    tlist <- GeneList[grep(times[i], names(GeneList))]
    # Populations are looked up by name, not by their position in tlist.
    pws <- tlist[[grep("PWS", names(tlist))]]["Gene_name"]
    tb <- tlist[[grep("TB", names(tlist))]]["Gene_name"]
    common_summary$PWS[i] <- nrow(pws)
    common_summary$TB[i] <- nrow(tb)

    pws_tb <- intersect(pws, tb)
    common_summary$common_PWS.TB[i] <- nrow(pws_tb)

    if (i != 3) {
        common[[i]] <- pws_tb
        names(common)[[i]] <- times[i]
        save_venn(list(PWS = pws$Gene_name, TB = tb$Gene_name), cols[c(2, 1)], times[i],
                  paste0(venn_dir, "Venn_", times[i], ".png"), 3)
    } else {
        ss <- tlist[[grep("SS", names(tlist))]]["Gene_name"]
        common_summary$SS[i] <- nrow(ss)

        pws_ss <- intersect(pws, ss)
        ss_tb <- intersect(ss, tb)
        all3 <- intersect(pws, ss_tb)
        common_summary$common_PWS.SS[i] <- nrow(pws_ss)
        common_summary$common_SS.TB[i] <- nrow(ss_tb)
        common_summary$common3[i] <- nrow(all3)

        # Stored in this order: PWS.SS, PWS.TB, SS.TB, all three
        # (later code indexes `common` by position).
        slots <- i + 0:3
        common[slots] <- list(pws_ss, pws_tb, ss_tb, all3)
        names(common)[slots] <- paste0(times[i], c("_PWS.SS", "_PWS.TB", "_SS.TB", "_3pops"))

        save_venn(list(PWS = pws$Gene_name, TB = tb$Gene_name, SS = ss$Gene_name), cols[c(2, 1, 3)], times[i],
                  paste0(venn_dir, "Venn_", times[i], ".png"), 4)
        save_venn(list(PWS = pws$Gene_name, TB = tb$Gene_name), cols[c(2, 1)], times[i],
                  paste0(venn_dir, "Venn_PWS_TB_", times[i], ".png"), 3)
        save_venn(list(PWS = pws$Gene_name, SS = ss$Gene_name), cols[c(2, 3)], times[i],
                  paste0(venn_dir, "Venn_PWS_SS_", times[i], ".png"), 3)
        save_venn(list(SS = ss$Gene_name, TB = tb$Gene_name), cols[c(3, 1)], times[i],
                  paste0(venn_dir, "Venn_SS_TB_", times[i], ".png"), 3)
    }
}
write.csv(common_summary, "../Output/COV/COVscan_3pop/cutoff/Common_genes_withIntergenes_3pops.csv")


# Names of the genes shared between populations: attach the chr / Gene_ID
# columns from `Genes` to every shared set in `common`, write one CSV per set
# and keep the merged tables in `common_times` under the same names.
common_times <- list()
for (slot in 1:length(common)) {
    set_label <- names(common)[slot]
    shared_tbl <- merge(data.frame(Gene_name = common[[slot]]), Genes, by = "Gene_name")
    write.csv(shared_tbl,
              paste0("../Output/COV/COVscan_3pop/cutoff/Common_genes_", set_label, ".csv"),
              row.names = FALSE)
    common_times[[slot]] <- shared_tbl
    names(common_times)[slot] <- set_label
}


# Overlapping gene IDs between populations, written as ShinyGO-style lists
# (IDs terminated by ";" and separated by a space).
ids_dir <- "../Output/COV/COVscan_3pop/cutoff/overlapping_genes/"

# Write the unique Gene_ID values of a one-column table to `file`. An empty
# overlap produces an empty file rather than a lone ";". Writing with
# cat(file = ) avoids diverting console output with sink().
write_gene_ids <- function(genes, file) {
    ids <- unique(genes$Gene_ID)
    cat(if (length(ids) > 0) paste0(ids, ";"), file = file)
}

# overlapping genes COV12 and COV13: common genes between the two gene lists
# available for each of these periods (PWS and TB)
for (tp in times[1:2]) {
    tlist <- GeneList[grep(tp, names(GeneList))]
    genes1 <- intersect(tlist[[1]]["Gene_ID"], tlist[[2]]["Gene_ID"])
    write_gene_ids(genes1, paste0(ids_dir, "geneIDs_PWS_TB_", tp, ".txt"))
}

# overlapping genes COV23 (the three lists are indexed 1 = PWS, 2 = SS, 3 = TB)
tlist <- GeneList[grep(times[3], names(GeneList))]
genes1 <- intersect(tlist[[1]]["Gene_ID"], tlist[[3]]["Gene_ID"]) #common genes between PWS and TB in cov23
genes2 <- intersect(tlist[[1]]["Gene_ID"], tlist[[2]]["Gene_ID"]) #common genes between PWS and SS in cov23
genes3 <- intersect(tlist[[2]]["Gene_ID"], tlist[[3]]["Gene_ID"]) #common genes between SS and TB in cov23
genes4 <- intersect(tlist[[1]]["Gene_ID"], genes3)                # Common genes in all 3 populations

write_gene_ids(genes1, paste0(ids_dir, "geneIDs_PWS_TB_cov23.txt"))
write_gene_ids(genes2, paste0(ids_dir, "geneIDs_PWS_SS_cov23.txt"))
write_gene_ids(genes3, paste0(ids_dir, "geneIDs_SS_TB_cov23.txt"))
write_gene_ids(genes4, paste0(ids_dir, "geneIDs_all3_cov23.txt"))



#2. Between Time-Points within a population
# For PWS and TB: intersect the gene names of each pair of covariance periods
# and of all three periods, store the shared sets in `common2` (four slots per
# population) and their sizes in `common_summary2`, and save a three-way Venn.

times <- c("cov12", "cov13", "cov23")
pops <- c("PWS", "TB")
common2 <- list()
common_summary2 <- data.frame(pop = rep(pops[1:2], each = 4))
# positions (within `times` and `plist`) of the two periods in each pairwise comparison
period_pairs <- list(c(1, 2), c(1, 3), c(2, 3))

for (i in 1:length(pops)) {
    plist <- GeneList[grep(pops[i], names(GeneList))]
    offset <- 4 * i - 4

    for (m in 1:3) {
        first <- period_pairs[[m]][1]
        second <- period_pairs[[m]][2]
        pair_label <- paste0(times[first], "_", times[second])
        shared_pair <- intersect(plist[[first]]["Gene_name"], plist[[second]]["Gene_name"])

        slot <- offset + m
        common2[[slot]] <- shared_pair
        names(common2)[[slot]] <- paste0(pops[i], ".", pair_label)
        common_summary2$Time[slot] <- pair_label
        common_summary2$no.of.genes[slot] <- nrow(shared_pair)

        if (m == 1) {
            # three-way Venn diagram of the gene names in the three periods
            x <- list(COV12 = plist[[1]][["Gene_name"]],
                      COV13 = plist[[2]][["Gene_name"]],
                      COV23 = plist[[3]][["Gene_name"]])
            ggvenn(x, fill_color = cols[c(1, 5, 7)], stroke_size = 0.5, set_name_size = 4, text_size = 3) +
                ggtitle(pops[i])
            ggsave(paste0("../Output/COV/COVscan_3pop/cutoff/Venn_", pops[i], ".png"),
                   width = 4, height = 4, dpi = 300)
        }
    }

    #common genes among all time periods
    slot <- offset + 4
    shared_all <- intersect(plist[[1]]["Gene_name"],
                            intersect(plist[[2]]["Gene_name"], plist[[3]]["Gene_name"]))
    common2[[slot]] <- shared_all
    names(common2)[[slot]] <- paste0(pops[i], ".all")
    common_summary2$Time[slot] <- "All"
    common_summary2$no.of.genes[slot] <- nrow(shared_all)
}
write.csv(common_summary2, "../Output/COV/COVscan_3pop/cutoff/Common_genes_betweenTimePoints.csv")


# Common gene names between time points: for each population, annotate every
# shared set in `common2` with the chr / Gene_ID columns from `Genes`, write
# one CSV per set, then write all sets stacked (with a Time label) per population.
common_out_dir <- "../Output/COV/COVscan_3pop/cutoff/"

for (pop in pops[1:2]) {
    glist <- common2[grep(pop, names(common2))]
    per_set <- lapply(seq_along(glist), function(j) {
        set_label <- names(glist)[j]
        df2 <- merge(data.frame(Gene_name = glist[[j]]), Genes, by = "Gene_name", all.x = TRUE)
        write.csv(df2, paste0(common_out_dir, "Common_genes_", set_label, ".csv"), row.names = FALSE)
        # rep() keeps this valid for an empty overlap (assigning a length-1
        # value to a zero-row data frame is an error)
        df2$Time <- rep(set_label, nrow(df2))
        df2
    })
    CommonGenes <- if (length(per_set) > 0) do.call(rbind, per_set) else data.frame()
    write.csv(CommonGenes, paste0(common_out_dir, "Common_genes_", pop, ".csv"), row.names = FALSE)
}



```



#### Overlapping gene numbers   
```{r echo=TRUE, message=FALSE, warning=FALSE}

# Summary table: show the saved between-population overlap summary
summary_csv <- "../Output/COV/COVscan_3pop/cutoff/Common_genes_withIntergenes_3pops.csv"
common_genes <- read.csv(summary_csv, row.names = 1)
knitr::kable(common_genes)

```


## What are the genes overlapping across different time points between populations? 

```{r eval=FALSE, message=FALSE, warning=FALSE}
## Between PWS and TB
# Elements 1, 2 and 4 of `common_times` are taken as the PWS-TB shared sets
# for cov12, cov13 and cov23, in that order.
pws.tb <- common_times[c(1, 2, 4)]
period_lower <- c("cov12", "cov13", "cov23")
period_upper <- c("COV12", "COV13", "COV23")
pws_tb_dir <- "../Output/COV/COVscan_3pop/cutoff/"

# Compare the PWS-TB shared gene sets of two periods (a, b index pws.tb):
# write the genes present in both to "Common_genes_PWS.TB.<a>-<b>.csv", save a
# Venn diagram of the two sets, and return the annotated common genes.
# Output names are derived from the periods actually compared; previously the
# cov12-cov13 and cov12-cov23 CSV names were swapped, one CSV name was
# misspelled, and the first Venn was saved outside cutoff/.
compare_periods <- function(a, b) {
    set_a <- pws.tb[[a]]["Gene_name"]
    set_b <- pws.tb[[b]]["Gene_name"]
    shared <- merge(intersect(set_a, set_b), Genes, by = "Gene_name")
    write.csv(shared, paste0(pws_tb_dir, "Common_genes_PWS.TB.", period_lower[a], "-", period_lower[b], ".csv"))

    # The two circles are period sets, so they are labelled by period
    # (they were labelled "PWS"/"TB" before).
    sets <- setNames(list(set_a$Gene_name, set_b$Gene_name), period_upper[c(a, b)])
    p <- ggvenn(sets, fill_color = cols[c(2, 1)], stroke_size = 0.5, set_name_size = 4, text_size = 3) +
        ggtitle(paste0(period_upper[a], "-", period_upper[b], " in PWS & TB"))
    ggsave(paste0(pws_tb_dir, "Venn_PWS_TB_", period_upper[a], "-", period_upper[b], ".png"),
           plot = p, width = 3, height = 3, dpi = 300)
    shared
}

# 1. Common genes between populations across time points in PWS and TB (COV12 - COV13)
genes1213 <- compare_periods(1, 2)
# (output recorded from an earlier run)
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2             ndst2a chr13 ENSCHAG00000002649
#3             zswim8 chr13 ENSCHAG00000005956

# 2. Common genes between populations across time points in PWS and TB (COV12 - COV23)
genes1223 <- compare_periods(1, 3)
# (output recorded from an earlier run)
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2 ENSCHAG00000022709 chr20 ENSCHAG00000022709
#3 ENSCHAG00000022815 chr20 ENSCHAG00000022815
#4             ndst2a chr13 ENSCHAG00000002649

# 3. between PWS and TB across COV13 and COV23
genes1323 <- compare_periods(2, 3)
# (output recorded from an earlier run)
#           Gene_name   chr            Gene_ID
#1 ENSCHAG00000001687 chr13 ENSCHAG00000001687
#2             ndst2a chr13 ENSCHAG00000002649




```

* Numbers of genes shared between populations (PWS and TB) across time points  
![](../Output/COV/COVscan_3pop/cutoff/Venn_PWS_TB_COV12-COV23.png)

![](../Output/COV/COVscan_3pop/cutoff/Venn_PWS_TB_COV13-COV23.png)




# Interpopulation comparison per time period 
```{r eval=FALSE, message=FALSE, warning=FALSE}
### Interpopulation comparisons
# Build `Covs`: one row per population pair and period, with the genome-wide
# covariance and its lower/upper confidence bounds looked up by label.

#decode the samples to create the right matrix
cv <- read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_3pops.csv", header = FALSE)
labs <- read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_Covs_interPopulations_100k_labels_3pops.csv")
labs <- labs[, -1]
cvm <- data.frame(label = as.vector(t(labs)), cov = as.vector(t(cv)))

#rearrange based on comparisons: covariance between populations within the same period
#PopYr Symbols
# PH 1 'PWS', 1991
# PH 2 'PWS', 1996
# PH 3 'PWS', 2006
# PH 4 'PWS', 2017
# PH 5 'SS',  1991
# PH 6 'SS',  1996
# PH 7 'SS',  2006
# PH 8 'SS',  2017
# PH 9 'TB',  1991
# PH 10'TB',  1996
# PH 11'TB',  2006
# PH 12'TB',  2017

# times = 3 (it was 6): three population pairs x three periods = 9 rows; the
# 18-element vector made data.frame() fail against the 9-element `period`.
Covs <- data.frame(pops = rep(c("PWS.vs.SS", "PWS.vs.TB", "SS.vs.TB"), times = 3),
                   period = rep(c("1991-1996", "1996-2006", "2006-2017"), each = 3))

# Label of the covariance entry for each row of Covs; NA where no value is
# taken (both SS comparisons in 1991-1996).
interpop_labels <- c(NA,                      "cov(PH: 2-1, PH: 10-9)",  NA,
                     "cov(PH: 3-2, PH: 7-6)", "cov(PH: 3-2, PH: 11-10)", "cov(PH: 7-6, PH: 11-10)",
                     "cov(PH: 4-3, PH: 8-7)", "cov(PH: 4-3, PH: 12-11)", "cov(PH: 8-7, PH: 12-11)")

# Return, for each label, the numeric value of `column` in the row of `tbl`
# carrying that label (NA for an NA label). Stops unless exactly one row matches.
value_by_label <- function(tbl, column, labels) {
    vapply(labels, function(lb) {
        if (is.na(lb)) return(NA_real_)
        hit <- tbl[[column]][which(tbl$label == lb)]
        if (length(hit) != 1) {
            stop("Expected exactly one entry labelled '", lb, "', found ", length(hit))
        }
        as.numeric(hit)
    }, numeric(1), USE.NAMES = FALSE)
}

Covs$cov <- value_by_label(cvm, "cov", interpop_labels)

#C.I.
cis <- read.csv("~/Projects/Pacherring_Vincent/MD7000/GW_COV_Interpop_comparison_CIs.csv")
cis <- cis[, -1]
# rows 1-11 are used as the lower bounds, rows 12-22 as the upper bounds
cim <- data.frame(label = as.vector(t(labs)), ci_l = as.vector(t(cis[1:11, ])))
cim$ci_h <- as.vector(t(cis[12:22, ]))

Covs$ci_l <- value_by_label(cim, "ci_l", interpop_labels)
Covs$ci_h <- value_by_label(cim, "ci_h", interpop_labels)


library(RColorBrewer)
# preview the qualitative palettes
display.brewer.all(type = "qual")

# Palette used for the inter-population plots.
colors2 <- brewer.pal(n = 8, "Set3")
# Recorded palette values (kept as a comment: a bare row of string literals is
# a syntax error in R):
# "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"

# Plots of the inter-population covariances in `Covs`.
# The y limits are set inside scale_y_continuous(): a separate ylim() call
# replaces the whole y scale and silently discards `labels = comma`.
# Line geoms use `linewidth` (the `size` aesthetic is deprecated for lines).

#Barplot
ggplot(Covs, aes(x = period, y = cov, fill = pops)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.8) +
    ylab("Covariance") + xlab('') + theme_classic() +
    geom_hline(yintercept = 0, color = "gray70", linewidth = 0.3) +
    scale_fill_manual(values = colors2[c(4, 1, 3)]) +
    theme(legend.title = element_blank()) +
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", linewidth = 0.2) +
    geom_errorbar(aes(ymin = ci_l, ymax = ci_h), width = .2, linewidth = .2,
                  position = position_dodge(width = 0.7))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new.png", width = 4.8, height = 3, dpi = 300)

#Point plot
ggplot(Covs, aes(x = period, y = cov, color = pops)) +
    geom_point(position = position_dodge(width = 0.7), size = 3) +
    ylab("Covariance") + xlab('') + theme_classic() +
    geom_hline(yintercept = 0, color = "gray70", linewidth = 0.3) +
    scale_color_manual(values = colors2[c(4, 1, 3)]) +
    theme(legend.title = element_blank()) +
    scale_y_continuous(labels = comma, limits = c(-0.0023, 0.002)) +
    geom_errorbar(aes(ymin = ci_l, ymax = ci_h), width = .2, linewidth = .2,
                  position = position_dodge(width = 0.7)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", linewidth = 0.3)
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new_PointPlot.png", width = 4.8, height = 3, dpi = 300)



#line plot
# numeric period index so the points of one population pair can be connected
Covs$time <- 1
Covs$time[Covs$period == "1996-2006"] <- 2
Covs$time[Covs$period == "2006-2017"] <- 3
Covs <- Covs[order(Covs$time), ]
ggplot(Covs, aes(x = time, y = cov, color = pops, group = pops)) +
    geom_point(position = position_dodge(width = 0.7), size = 4) +
    geom_path(position = position_dodge(width = 0.7)) +
    ylab("Covariance") + xlab('') + theme_classic() +
    geom_hline(yintercept = 0, color = "gray70", linewidth = 0.3) +
    scale_color_manual(values = colors2[c(4, 1, 3)]) +
    theme(legend.title = element_blank()) +
    scale_y_continuous(labels = comma, limits = c(-0.0023, 0.002)) +
    geom_errorbar(aes(ymin = ci_l, ymax = ci_h), width = .2, linewidth = .2,
                  position = position_dodge(width = 0.7)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", linewidth = 0.2) +
    scale_x_continuous(breaks = c(1, 2, 3), labels = c("1991-1996", "1996-2006", "2006-2017"))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_new_LinePlot.png", width = 4.8, height = 3, dpi = 300)

```

![](../Output/COV/Interpop_cov_comparison_3Pops_new.png)


![](../Output/COV/Interpop_cov_comparison_3Pops_new_LinePlot.png)

## Longer time period  
```{r eval=FALSE, message=FALSE, warning=FALSE}
## Longer time-period
# Build `Covs2`: inter-population covariances and confidence bounds for the
# three longer spans (1991-2006, 1991-2017, 1996-2017).
md7000 <- "~/Projects/Pacherring_Vincent/MD7000/"

Covs2 <- data.frame(pops = rep(c("PWS.vs.SS", "PWS.vs.TB", "SS.vs.TB"), times = 3),
                    period = rep(c("1991-2006", "1991-2017", "1996-2017"), each = 3))

# Read the covariance matrix and its label matrix for one span and flatten
# both row-wise into a label/cov table.
read_cov_span <- function(span) {
    values <- read.csv(paste0(md7000, "GW_Covs_interPopulations_100k_", span, ".csv"), header = FALSE)
    span_labels <- read.csv(paste0(md7000, "GW_Covs_interPopulations_100k_labels_", span, ".csv"))
    span_labels <- span_labels[, -1]
    data.frame(label = as.vector(t(span_labels)), cov = as.vector(t(values)))
}
cvm1 <- read_cov_span("1991-2006")
cvm2 <- read_cov_span("1991-2017")
cvm3 <- read_cov_span("1996-2017")

Covs2$cov <- c(NA, cvm1$cov[cvm1$label == "cov(PH: 2-1, PH: 4-3)"], NA,
               NA, cvm2$cov[cvm2$label == "cov(PH: 2-1, PH: 4-3)"], NA,
               cvm3$cov[cvm3$label == "cov(PH: 2-1, PH: 4-3)"],
               cvm3$cov[cvm3$label == "cov(PH: 2-1, PH: 6-5)"],
               cvm3$cov[cvm3$label == "cov(PH: 4-3, PH: 6-5)"])

#C.I.
# Read the confidence-interval table for one span, dropping its first column.
read_ci_span <- function(span) {
    ci_tbl <- read.csv(paste0(md7000, "GW_COV_Interpop_comparison_CIs_", span, ".csv"))
    ci_tbl[, -1]
}
cis1 <- read_ci_span("1991-2006")
cis2 <- read_ci_span("1991-2017")
cis3 <- read_ci_span("1996-2017")

#cim<-data.frame(label=as.vector(t(labs)), ci_l=as.vector(t(cis1[1:4,])))
#cim$ci_h<-as.vector(t(cis[12:22,]))

# bounds are picked by fixed row/column position in each CI table
lower_bounds <- c(NA, cis1[1, 3], NA,
                  NA, cis2[1, 3], NA,
                  cis3[1, 3], cis3[1, 5], cis3[3, 5])
upper_bounds <- c(NA, cis1[4, 3], NA,
                  NA, cis2[4, 3], NA,
                  cis3[6, 3], cis3[6, 5], cis3[8, 5])
Covs2$ci_l <- as.numeric(lower_bounds)
Covs2$ci_h <- as.numeric(upper_bounds)


# Bar and point plots of the longer-span covariances in `Covs2`.
# The y limits are set inside scale_y_continuous(): a separate ylim() call
# replaces the whole y scale and silently discards `labels = comma`.
# Line geoms use `linewidth` (the `size` aesthetic is deprecated for lines).
ggplot(Covs2, aes(x = period, y = cov, fill = pops)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.8) +
    ylab("Covariance") + xlab('') + theme_classic() +
    geom_hline(yintercept = 0, color = "gray70", linewidth = 0.3) +
    scale_fill_manual(values = colors2[c(4, 1, 3)]) +
    theme(legend.title = element_blank()) +
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", linewidth = 0.2) +
    geom_errorbar(aes(ymin = ci_l, ymax = ci_h), width = .2, linewidth = .2,
                  position = position_dodge(width = 0.7))
ggsave("../Output/COV/Interpop_cov_comparison_3Pops_LonogerPeriod.png", width = 4.9, height = 3, dpi = 300)

ggplot(Covs2, aes(x = period, y = cov, color = pops)) +
    geom_point(position = position_dodge(width = 0.7), size = 4) +
    ylab("Covariance") + xlab('') + theme_classic() +
    geom_hline(yintercept = 0, color = "gray70", linewidth = 0.3) +
    scale_color_manual(values = colors2[c(4, 1, 3)]) +
    theme(legend.title = element_blank()) +
    scale_y_continuous(labels = comma, limits = c(-0.0013, 0.002)) +
    geom_errorbar(aes(ymin = ci_l, ymax = ci_h), width = .2, linewidth = .2,
                  position = position_dodge(width = 0.7)) +
    geom_vline(xintercept = c(1.5, 2.5), color = "gray", linewidth = 0.3)
ggsave("../Output/COV/Interpop_cov_comparison_3PopsLonogerPeriod_PointPlot.png", width = 4.7, height = 3, dpi = 300)


```
![](../Output/COV/Interpop_cov_comparison_3PopsLonogerPeriod_PointPlot.png)

# Focused allele-frequency analysis

## ccr6a (chr15: 16,066,502 - 16,091,639)
```{r eval=FALSE, message=FALSE, warning=FALSE}
# Collect the PWS allele frequencies (column knownEM of the .mafs files) in
# the chr15 16.05-16.10 Mb window for each sampling year, and label every
# position by its 1991 -> 1996 -> 2007 trend.
pops <- c("PWS91", "PWS96", "PWS07", "PWS17")
yr <- c(1991, 1996, 2007, 2017)
maf <- data.frame()
for (i in seq_along(pops)) {
    af <- read.table(paste0("../Data/new_vcf/AF/", pops[i], ".mafs"), sep = "\t", header = TRUE)
    af <- af[af$chromo == "chr15" & af$position >= 16050000 & af$position <= 16100000, ]
    af$year <- yr[i]
    maf <- rbind(maf, af)
}

# Classify one position from its frequencies and years: "down" if the
# frequency fell in both intervals, "up" if it rose in both, otherwise "none".
# A position without exactly one non-missing value in each of the three years
# is "none" (the previous `if` stopped with an error in that case).
classify_trend <- function(freq, year) {
    f91 <- freq[year == 1991]
    f96 <- freq[year == 1996]
    f07 <- freq[year == 2007]
    if (length(f91) != 1 || length(f96) != 1 || length(f07) != 1 || anyNA(c(f91, f96, f07))) {
        return("none")
    }
    if (f91 > f96 && f96 > f07) {
        "down"
    } else if (f91 < f96 && f96 < f07) {
        "up"
    } else {
        "none"
    }
}

rows_by_position <- split(seq_len(nrow(maf)), maf$position)
trend_by_position <- vapply(rows_by_position,
                            function(rows) classify_trend(maf$knownEM[rows], maf$year[rows]),
                            character(1))
maf$trend <- unname(trend_by_position[as.character(maf$position)])

# Written once, after the trend column exists (an earlier write of the same
# file without the trend column was immediately overwritten).
write.csv(maf, "../Output/COV/COVscan_3pop/cutoff/ccrc6_MAFchange_chr15_16Mb.csv")

# Allele-frequency trajectories in the window, one line per position.
# theme() adjustments come after theme_minimal(): a complete theme overrides
# any theme() call placed before it. Lines use `linewidth` (the `size`
# aesthetic is deprecated for line geoms).
ggplot(maf, aes(x = year, y = knownEM, color = factor(position))) +
    geom_point(size = 1.5) +
    geom_line(linewidth = 0.3) + ggtitle("CCRC6 gene AF changes") +
    ylab("maf") +
    theme_minimal() +
    theme(legend.text = element_text(size = 5), legend.title = element_blank())
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15.png", width = 6, height = 3, dpi = 300)

#color by trend
maf$trend <- factor(maf$trend, levels = c("up", "down", "none"))
ggplot(maf, aes(x = year, y = knownEM, color = trend)) +
    geom_point(size = 1.5) +
    geom_path(aes(group = position), linewidth = 0.3) + ggtitle("CCRC6 gene AF changes 1991-2007") +
    ylab("maf") +
    theme_minimal() + theme(legend.title = element_blank()) +
    scale_color_manual("Trend", values = c("deeppink2", "royalblue", "gray"))
ggsave("../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_trend.png", width = 6, height = 3, dpi = 300)

up <- maf[maf$trend == "up", ]
down <- maf[maf$trend == "down", ]
library(ggrepel)

# Plot separately

# Plot the trajectories of the loci in `dat` (all sharing one trend), label
# each locus with its position at the latest year, save the figure to `file`,
# and return `dat` with the added `label` column. `direction` is the word
# shown in the title.
plot_trend_loci <- function(dat, direction, line_col, label_size, file) {
    dat <- dat %>%
        mutate(label = if_else(year == max(year), as.character(position), NA_character_))
    n_loci <- length(unique(dat$position))
    p <- ggplot(dat, aes(x = year, y = knownEM)) +
        geom_point(size = 1.5, color = line_col, alpha = 0.8) +
        geom_path(aes(group = position), linewidth = 0.3, color = line_col) +
        ggtitle(paste0("CCRC6 AF changes 1991-2007, ", direction, " ", n_loci, " loci")) +
        ylab("maf") +
        theme_minimal() + theme(legend.title = element_blank()) +
        geom_label_repel(aes(label = label), label.size = 0.1, size = label_size,
                         nudge_x = 2,
                         na.rm = TRUE)
    ggsave(file, plot = p, width = 6, height = 3, dpi = 300)
    dat
}

up <- plot_trend_loci(up, "Up", "deeppink2", 2,
                      "../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_Up.png")
# The title of this plot now says "Down" (it was copy-pasted as "Up").
down <- plot_trend_loci(down, "Down", "royalblue", 3,
                        "../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_down.png")



```

![](../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15.png)
![](../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_trend.png)

![](../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_Up.png)

![](../Output/COV/COVscan_3pop/cutoff/PWS_ccr6c_AFchange_ch15_down.png)

### AF changes in all 3 pops
```{r eval=FALSE, message=FALSE, warning=FALSE}

### All population/year samples (TB, PWS, SS)
# Allele frequencies in the chr15 16.05-16.10 Mb window for every sample,
# tagged with sampling year and population, plotted per position.
pops <- c("TB91", "TB96", "TB06", "TB17", "PWS91", "PWS96", "PWS07", "PWS17", "SS96", "SS06", "SS17")
yr <- c(1991, 1996, 2006, 2017, 1991, 1996, 2007, 2017, 1996, 2006, 2017)

window_freqs <- lapply(seq_along(pops), function(s) {
    sample_af <- read.table(paste0("../Data/new_vcf/AF/", pops[s], ".mafs"), sep = "\t", header = TRUE)
    in_window <- sample_af$chromo == "chr15" &
        sample_af$position >= 16050000 & sample_af$position <= 16100000
    sample_af <- sample_af[in_window, ]
    sample_af$year <- yr[s]
    # population name = sample name without its two-digit year
    sample_af$pop <- sub("\\d\\d", "", pops[s])
    sample_af
})
# stack the samples one after another, starting from an empty data frame
maf <- Reduce(rbind, window_freqs, data.frame())
#write.csv(maf,"../Output/COV/COVscan_3pop/AF_maf_chr13_23Mb.csv")


ggplot(maf, aes(x = year, y = knownEM, color = pop)) +
    facet_wrap(~factor(position)) +
    geom_point(size = 1.5) +
    geom_path(linewidth = 0.6) +
    ggtitle("CCRC6 gene (chr15)") +
    ylab("maf") +
    theme(legend.title = element_blank()) +
    scale_color_manual(values = cols[c(2, 3, 1)])
ggsave("../Output/COV/COVscan_3pop/cutoff/ccrc6_AFchanges_allPops.png", width = 8, height = 6, dpi = 300)

```

![](../Output/COV/COVscan_3pop/cutoff/ccrc6_AFchanges_allPops.png)









```{r eval=FALSE, message=FALSE, warning=FALSE}




```

```{r eval=FALSE, message=FALSE, warning=FALSE}
```

